## Setup

# clear workspace
rm(list = ls()); gc()

# load packages
library(tidyverse)
library(survey)
library(stargazer)
library(ggplot2)
library(sjPlot)
library(ggpubr)
library(estimatr)
library(interflex)
library(specr)
library(parallel)
library(MASS)
library(dplyr)
library(car)
library(effects)
library(lmtest)
library(sandwich)
library(readxl)

# set working directory to source file location
setwd(dirname(rstudioapi::getActiveDocumentContext()$path))


# ~~~~~~~~~~


# This script takes about 30 hours to run using parallelization


# ~~~~~~~~~~


## Prep data

# load data, filtering out 1 observation with missing weight
setwd("../data/processed")
data <- readRDS("master_data.rds") %>% filter(!is.na(weight))

# create string of weeks in which sample is registered voters
weeks_voters <- c("2020-09-26", "2020-10-03", "2020-10-10", "2020-10-17", "2020-10-24", "2020-10-31", "2020-11-07", "2020-11-14", "2020-11-21")

# create string of weeks in which sample is adult American citizens
weeks_citizens <- c("2020-05-23", "2020-05-30", "2020-06-06", "2020-06-13", "2020-08-15", "2020-08-22", "2020-08-29", "2020-09-05", "2020-09-12", "2020-09-19")

# create per-capita hospital beds variable
data$beds_per_10k <- data$beds/(data$population/10000)

# create per-capita COVID cases and deaths variables
data$cases_per_10k <- data$cases/(data$population/10000)
data$deaths_per_10k <- data$deaths/(data$population/10000)
data$cases_30_per_10k <- data$cases_30/(data$population/10000)
data$deaths_30_per_10k <- data$deaths_30/(data$population/10000)

# create weekly percentile COVID cases and deaths variables using per-capita cases and deaths
data <- data %>% group_by(surveydate) %>% mutate(covid_1 = ntile(cases_per_10k, 100))
data <- data %>% group_by(surveydate) %>% mutate(covid_2 = ntile(deaths_per_10k, 100))
data <- data %>% group_by(surveydate) %>% mutate(covid_3 = ntile(cases_30_per_10k, 100))
data <- data %>% group_by(surveydate) %>% mutate(covid_4 = ntile(deaths_30_per_10k, 100))

# create natural logged COVID cases and deaths variables | make any negatives equal to 0
# these occur when locals correct previously fatality counts and thus include daily negative values
data$log_cases <- log(data$cases_per_10k + 1)
data$log_deaths <- log(data$deaths_per_10k + 1)
data$log_cases_30 <- ifelse(data$cases_30_per_10k >= 0, log(data$cases_30_per_10k + 1), 0)
data$log_deaths_30 <- ifelse(data$deaths_30_per_10k >= 0, log(data$deaths_30_per_10k + 1), 0)

# transform mask variable ("Have you worn a face mask in public?") to binary (asked weekly 4/4 - 8/22)
data$cor_pik_facemask_1 <- ifelse(data$cor_pik_facemask_1 == 1, 1, 
                                  ifelse(data$cor_pik_facemask_1 == 2, 0, NA))

# transform Trump support ("How much do you think Donald Trump cares about the needs and problems of people like you?") to binary
data$trump <- ifelse(data$care_dtrmp == 1 | data$care_dtrmp == 2, 1, # a lot | some
                     ifelse(data$care_dtrmp == 3 | data$care_dtrmp == 4, 0, NA)) # not much | doesn't care at all | counts "not sure"s as NAs

# create period indicator variables to indicate four periods in which the response options labeled as under-predictions remain the same
data$period_1 <- ifelse(data$surveydate %in% c("2020-05-23", "2020-05-30", "2020-06-06", "2020-06-13"), 1, 0)
data$period_2 <- ifelse(data$surveydate %in% c("2020-08-15", "2020-08-22", "2020-08-29", "2020-09-05", "2020-09-12"), 1, 0)
data$period_3 <- ifelse(data$surveydate %in% c("2020-09-19", "2020-09-26", "2020-10-03", "2020-10-10", "2020-10-17", "2020-10-24", "2020-10-31", "2020-11-07"), 1, 0)

# create new data frames that limit the data to only the 2.5th to 97.5th COVID incidence counties ~ per survey week ~ 
filtered_data_cases <- data %>%
  group_by(surveydate) %>%
  filter(between(cases_per_10k, quantile(cases_per_10k, 0.025), quantile(cases_per_10k, 0.975)))

filtered_data_deaths <- data %>%
  group_by(surveydate) %>%
  filter(between(deaths_per_10k, quantile(deaths_per_10k, 0.025), quantile(deaths_per_10k, 0.975)))

# survey design objects
d <- svydesign(ids = ~FIPS, weights = ~weight, data = data)
d1_cases <- svydesign(ids = ~FIPS, weights = ~weight, data = filtered_data_cases)
d1_deaths <- svydesign(ids = ~FIPS, weights = ~weight, data = filtered_data_deaths); rm(filtered_data_cases, filtered_data_deaths)


# ~~~~~~~~~~


## Appendix A.1: Sample characteristics

# initialize table
output <- data.frame()

# gender
output <- bind_rows(output, cbind(data %>% ungroup() %>% count(female) %>% mutate(prop = round(100*(n/sum(n)),1)) %>% dplyr::select(prop),
                                  data %>% ungroup() %>% count(female, wt = weight) %>% mutate(prop = round(100*(n/sum(n)),1)) %>% dplyr::select(prop),
                                  data %>% ungroup() %>% filter(surveydate %in% weeks_citizens) %>% count(female, wt = weight) %>% mutate(prop = round(100*(n/sum(n)),1)) %>% dplyr::select(prop),
                                  data %>% ungroup() %>% filter(surveydate %in% weeks_voters) %>% count(female, wt = weight) %>% mutate(prop = round(100*(n/sum(n)),1)) %>% dplyr::select(prop)))

# education
data$educ_cat <- ifelse(data$educ == 1, 1, # no HS
                        ifelse(data$educ == 2, 2, # HS
                               ifelse(data$educ == 3 | data$educ == 4, 3, # associates or some college 
                                      ifelse(data$educ == 5, 4, # college
                                             ifelse(data$educ == 6, 5, NA))))) # post-graduate

output <- bind_rows(output, cbind(data %>% ungroup() %>% count(educ_cat) %>% mutate(prop = round(100*(n/sum(n)),1)) %>% dplyr::select(prop),
                                  data %>% ungroup() %>% count(educ_cat, wt = weight) %>% mutate(prop = round(100*(n/sum(n)),1)) %>% dplyr::select(prop),
                                  data %>% ungroup() %>% filter(surveydate %in% weeks_citizens) %>% count(educ_cat, wt = weight) %>% mutate(prop = round(100*(n/sum(n)),1)) %>% dplyr::select(prop),
                                  data %>% ungroup() %>% filter(surveydate %in% weeks_voters) %>% count(educ_cat, wt = weight) %>% mutate(prop = round(100*(n/sum(n)),1)) %>% dplyr::select(prop)))

# race
output <- bind_rows(output, cbind(data %>% ungroup() %>% count(race) %>% mutate(prop = round(100*(n/sum(n)),1)) %>% dplyr::select(prop),
                                  data %>% ungroup() %>% count(race, wt = weight) %>% mutate(prop = round(100*(n/sum(n)),1)) %>% dplyr::select(prop),
                                  data %>% ungroup() %>% filter(surveydate %in% weeks_citizens) %>% count(race, wt = weight) %>% mutate(prop = round(100*(n/sum(n)),1)) %>% dplyr::select(prop),
                                  data %>% ungroup() %>% filter(surveydate %in% weeks_voters) %>% count(race, wt = weight) %>% mutate(prop = round(100*(n/sum(n)),1)) %>% dplyr::select(prop)))

# age
data <- data %>% mutate(age_cat_census = cut(age, breaks = c(20,29,44,64,max(age)),
                                             labels = c("20-29", "30-44", "45-64", "65 and older"),
                                             include.lowest = TRUE))

output <- bind_rows(output, cbind(data %>% ungroup() %>% filter(!is.na(age_cat_census)) %>% count(age_cat_census) %>% mutate(prop = round(100*(n/sum(n)),1)) %>% dplyr::select(prop),
                                  data %>% ungroup() %>% filter(!is.na(age_cat_census)) %>% count(age_cat_census, wt = weight) %>% mutate(prop = round(100*(n/sum(n)),1)) %>% dplyr::select(prop), 
                                  data %>% ungroup() %>% filter(!is.na(age_cat_census) & surveydate %in% weeks_citizens) %>% count(age_cat_census, wt = weight) %>% mutate(prop = round(100*(n/sum(n)),1)) %>% dplyr::select(prop), 
                                  data %>% ungroup() %>% filter(!is.na(age_cat_census) & surveydate %in% weeks_voters) %>% count(age_cat_census, wt = weight) %>% mutate(prop = round(100*(n/sum(n)),1)) %>% dplyr::select(prop)))

# income
data <- data %>% mutate(inc_cat = cut(faminc, breaks = c(0,1,2,5,9,16),
                                      labels = c("<$10k", "$10k-<$20k", "$20k-<$50k",
                                                 "$50k-<$100k", "$100k+")))

output <- bind_rows(output, cbind(data %>% ungroup() %>% filter(!is.na(inc_cat)) %>% count(inc_cat) %>% mutate(prop = round(100*(n/sum(n)),1)) %>% dplyr::select(prop), 
                                  data %>% ungroup() %>% filter(!is.na(inc_cat))  %>% count(inc_cat, wt = weight) %>% mutate(prop = round(100*(n/sum(n)),1)) %>% dplyr::select(prop), 
                                  data %>% ungroup() %>% filter(!is.na(inc_cat) & surveydate %in% weeks_citizens) %>% count(inc_cat, wt = weight) %>% mutate(prop = round(100*(n/sum(n)),1)) %>% dplyr::select(prop), 
                                  data %>% ungroup() %>% filter(!is.na(inc_cat) & surveydate %in% weeks_voters) %>% count(inc_cat, wt = weight) %>% mutate(prop = round(100*(n/sum(n)),1)) %>% dplyr::select(prop)))

# Census region
output <- bind_rows(output, cbind(data %>% ungroup() %>% count(region) %>% mutate(prop = round(100*n/sum(n),1)) %>% dplyr::select(prop),
                                  data %>% ungroup() %>% count(region, wt = weight) %>% mutate(prop = round(100*n/sum(n),1)) %>% dplyr::select(prop),
                                  data %>% ungroup() %>% filter(surveydate %in% weeks_citizens) %>% count(region, wt = weight) %>% mutate(prop = round(100*n/sum(n),1)) %>% dplyr::select(prop),
                                  data %>% ungroup() %>% filter(surveydate %in% weeks_voters) %>% count(region, wt = weight) %>% mutate(prop = round(100*n/sum(n),1)) %>% dplyr::select(prop)))

# partisan identity
output <- bind_rows(output, cbind(data %>% ungroup() %>% count(pid7) %>% mutate(prop = round(100*(n/sum(n)),1)) %>% dplyr::select(prop),
                                  data %>% ungroup() %>% count(pid7, wt = weight) %>% mutate(prop = round(100*(n/sum(n)),1)) %>% dplyr::select(prop), 
                                  data %>% ungroup() %>% filter(surveydate %in% weeks_citizens) %>% count(pid7, wt = weight) %>% mutate(prop = round(100*(n/sum(n)),1)) %>% dplyr::select(prop), 
                                  data %>% ungroup() %>% filter(surveydate %in% weeks_voters) %>% count(pid7, wt = weight) %>% mutate(prop = round(100*(n/sum(n)),1)) %>% dplyr::select(prop)))

# political ideology (first collapse not sure into moderate)
data <- data %>% mutate(ideo5_2 = if_else(ideo5 == "Not sure", "Moderate", ideo5))

output <- bind_rows(output, cbind(data %>% ungroup() %>% count(ideo5_2) %>% mutate(prop = round(100*(n/sum(n)),1)) %>% dplyr::select(prop), 
                                  data %>% ungroup() %>% count(ideo5_2, wt = weight) %>% mutate(prop = round(100*(n/sum(n)),1)) %>% dplyr::select(prop), 
                                  data %>% ungroup() %>% filter(surveydate %in% weeks_citizens) %>% count(ideo5_2, wt = weight) %>% mutate(prop = round(100*(n/sum(n)),1)) %>% dplyr::select(prop), 
                                  data %>% ungroup() %>% filter(surveydate %in% weeks_voters) %>% count(ideo5_2, wt = weight) %>% mutate(prop = round(100*(n/sum(n)),1)) %>% dplyr::select(prop)))

# format table
output <- output %>%
  rename(Unweighted = `prop...1`, Weighted = `prop...2`, Citizens = `prop...3`, Voters = `prop...4`) %>%
  mutate(category = c(rep("Sex", 2), rep("Education", 5), rep("Race", 4), rep("Age", 4), rep("Income", 5), rep("Census region", 4), rep("Partisan identity", 7), rep("Political ideology", 5)),
         item = c("Male", "Female", 
                  "No high school", "High school", "Some college or associates", "4-year college", "Post-graduate", 
                  "White", "Black", "Hispanic", "Other",
                  "20-29", "30-34", "45-64", "65+",
                  "< $10,000", "$10,000 - $19,999", "$20,000 - $49,999", "$50,000 - $99,999", "$100,000+",
                  "Northeast", "Midwest", "South", "West", 
                  t(data %>% ungroup() %>% count(pid7) %>% mutate(prop = round(100*(n/sum(n)),1)) %>% dplyr::select(pid7)),
                  t(data %>% ungroup() %>% count(ideo5_2) %>% mutate(prop = round(100*(n/sum(n)),1)) %>% dplyr::select(ideo5_2))))

# save table
setwd("../../output")
write.csv(output, "Table A.1.csv", row.names = F); rm(output, weeks_citizens, weeks_voters)

# what proportion of respondents say "prefer not to say" to income question
writeLines(paste("Proportion of respondents who prefer not to respond to family income item:", round(sum(data$faminc == 97)/nrow(data),3)*100), "Appendix A.1.txt")


# ~~~~~~~~~~


## Appendix A.3: Effect of over- and underestimating on mask wearing

# write and save proportion of respondents who self-report wearing a face mask
writeLines(paste("Proportion of respondents who wear face masks", round(mean(data$cor_pik_facemask_1, na.rm = T),3)), "Appendix A.3.txt")

# run regressions

# under | individual-level controls
fit_A3.1 <- lm_robust(cor_pik_facemask_1 ~ under + week + period_1 + age + female + educ3 + race + inc3 + pid7 + ideo5 + trump, 
                      data = data, weights = weight, clusters = FIPS, se_type = "CR2")

# under | individual- and district-level controls
fit_A3.2 <- lm_robust(cor_pik_facemask_1 ~ under + week + period_1 + age + female + educ3 + race + inc3 + pid7 + ideo5 + trump +
                        beds_per_10k + rural_urban_continuum + bachelors + unemployment + median_household_income + rep_share + black + hispanic, 
                      data = data, weights = weight, clusters = FIPS, se_type = "CR2")

# under | individual- and district-level controls plus COVID incidence and media attention
fit_A3.3 <- lm_robust(cor_pik_facemask_1 ~ under + week + period_1 + age + female + educ3 + race + inc3 + pid7 + ideo5 + trump +
                        beds_per_10k + rural_urban_continuum + bachelors + unemployment + median_household_income + rep_share + black + hispanic + 
                        log_deaths*covid_nonpartisan_estimate, 
                      data = data, weights = weight, clusters = FIPS, se_type = "CR2")

# under | logistic regression
fit_A3.4 <- svyglm(cor_pik_facemask_1 ~ under + week + period_1 + age + female + educ3 + race + inc3 + pid7 + ideo5 + trump +
                     beds_per_10k + rural_urban_continuum + bachelors + unemployment + median_household_income + rep_share + black + hispanic +
                     log_deaths*covid_nonpartisan_estimate,
                   design = d, family = quasibinomial(link = "logit"), rescale = T)

# extract values
sum_fit1 <- as.data.frame(summary(fit_A3.1)$coefficient) %>% mutate(coef = row.names(.)) %>% filter(coef == "under") %>% dplyr::select(-c(`t value`, `CI Lower`, `CI Upper`, DF))
sum_fit2 <- as.data.frame(summary(fit_A3.2)$coefficient) %>% mutate(coef = row.names(.)) %>% filter(coef == "under") %>% dplyr::select(-c(`t value`, `CI Lower`, `CI Upper`, DF))
sum_fit3 <- as.data.frame(summary(fit_A3.3)$coefficient) %>% mutate(coef = row.names(.)) %>% filter(coef == "under") %>% dplyr::select(-c(`t value`, `CI Lower`, `CI Upper`, DF))
sum_fit4 <- as.data.frame(summary(fit_A3.4)$coefficient) %>% mutate(coef = row.names(.)) %>% filter(coef == "under") %>% dplyr::select(-`t value`)
rownames(sum_fit1) <- rownames(sum_fit2) <- rownames(sum_fit3) <- rownames(sum_fit4) <- NULL

# make table
tab <- bind_rows(sum_fit1, sum_fit2, sum_fit3, sum_fit4) %>%
  mutate(Estimate = round(Estimate, 3),
         `Std. Error` = round(`Std. Error`, 3),
         `Pr(>|t|)` = round(`Pr(>|t|)`, 3),
         Observations = c(nobs(fit_A3.1), nobs(fit_A3.2), nobs(fit_A3.3), nobs(fit_A3.4)))

# save
write.csv(tab, "Table A.2.csv", row.names = F); rm(tab, fit_A3.1, fit_A3.2, fit_A3.3, fit_A3.4, sum_fit1, sum_fit2, sum_fit3, sum_fit4)


# ~~~~~~~~~~


## Appendix A.6: Direct effect of media attention and local incidence

# Table A.3

# initialize data frame to store output
tab <- data.frame()

# regressions
fit <- svyglm(under ~ week + period_1 + period_2 + period_3 + 
                log_deaths + 
                unemployment + beds_per_10k + rural_urban_continuum + bachelors + median_household_income + rep_share + black + hispanic + 
                age + female + educ3 + race + inc3 + pid7 + ideo5, 
              design = d, family = quasibinomial(link = "logit"), rescale = T)

tab <- rbind(tab, cbind("underestimating", "log_deaths", round(summary(fit)$coefficients["log_deaths","Estimate"],3), 
                        round(summary(fit)$coefficients["log_deaths","Std. Error"],3),
                        round(summary(fit)$coefficients["log_deaths","Pr(>|t|)"],3)))

fit <- svyglm(under ~ week + period_1 + period_2 + period_3 + 
                log_deaths_30 + 
                unemployment + beds_per_10k + rural_urban_continuum + bachelors + median_household_income + rep_share + black + hispanic + 
                age + female + educ3 + race + inc3 + pid7 + ideo5, 
              design = d, family = quasibinomial(link = "logit"), rescale = T)

tab <- rbind(tab, cbind("underestimating", "log_deaths_30", round(summary(fit)$coefficients["log_deaths_30","Estimate"],3), 
                        round(summary(fit)$coefficients["log_deaths_30","Std. Error"],3),
                        round(summary(fit)$coefficients["log_deaths_30","Pr(>|t|)"],3)))

fit <- svyglm(under ~ week + period_1 + period_2 + period_3 + 
                log_cases + 
                unemployment + beds_per_10k + rural_urban_continuum + bachelors + median_household_income + rep_share + black + hispanic + 
                age + female + educ3 + race + inc3 + pid7 + ideo5, 
              design = d, family = quasibinomial(link = "logit"), rescale = T)

tab <- rbind(tab, cbind("underestimating", "log_cases", round(summary(fit)$coefficients["log_cases","Estimate"],3), 
                        round(summary(fit)$coefficients["log_cases","Std. Error"],3),
                        round(summary(fit)$coefficients["log_cases","Pr(>|t|)"],3)))

fit <- svyglm(under ~ week + period_1 + period_2 + period_3 + 
                log_cases_30 + 
                unemployment + beds_per_10k + rural_urban_continuum + bachelors + median_household_income + rep_share + black + hispanic + 
                age + female + educ3 + race + inc3 + pid7 + ideo5, 
              design = d, family = quasibinomial(link = "logit"), rescale = T)

tab <- rbind(tab, cbind("underestimating", "log_cases_30", round(summary(fit)$coefficients["log_cases_30","Estimate"],3), 
                        round(summary(fit)$coefficients["log_cases_30","Std. Error"],3),
                        round(summary(fit)$coefficients["log_cases_30","Pr(>|t|)"],3)))

fit <- svyglm(over ~ week + period_1 + period_2 + period_3 + 
                log_deaths + 
                unemployment + beds_per_10k + rural_urban_continuum + bachelors + median_household_income + rep_share + black + hispanic + 
                age + female + educ3 + race + inc3 + pid7 + ideo5, 
              design = d, family = quasibinomial(link = "logit"), rescale = T)

tab <- rbind(tab, cbind("overestimating", "log_deaths", round(summary(fit)$coefficients["log_deaths","Estimate"],3), 
                        round(summary(fit)$coefficients["log_deaths","Std. Error"],3),
                        round(summary(fit)$coefficients["log_deaths","Pr(>|t|)"],3)))

fit <- svyglm(over ~ week + period_1 + period_2 + period_3 + 
                log_deaths_30 + 
                unemployment + beds_per_10k + rural_urban_continuum + bachelors + median_household_income + rep_share + black + hispanic + 
                age + female + educ3 + race + inc3 + pid7 + ideo5, 
              design = d, family = quasibinomial(link = "logit"), rescale = T)

tab <- rbind(tab, cbind("overestimating", "log_deaths_30", round(summary(fit)$coefficients["log_deaths_30","Estimate"],3), 
                        round(summary(fit)$coefficients["log_deaths_30","Std. Error"],3),
                        round(summary(fit)$coefficients["log_deaths_30","Pr(>|t|)"],3)))

fit <- svyglm(over ~ week + period_1 + period_2 + period_3 + 
                log_cases + 
                unemployment + beds_per_10k + rural_urban_continuum + bachelors + median_household_income + rep_share + black + hispanic + 
                age + female + educ3 + race + inc3 + pid7 + ideo5, 
              design = d, family = quasibinomial(link = "logit"), rescale = T)

tab <- rbind(tab, cbind("overestimating", "log_cases", round(summary(fit)$coefficients["log_cases","Estimate"],3), 
                        round(summary(fit)$coefficients["log_cases","Std. Error"],3),
                        round(summary(fit)$coefficients["log_cases","Pr(>|t|)"],3)))

fit <- svyglm(over ~ week + period_1 + period_2 + period_3 + 
                log_cases_30 + 
                unemployment + beds_per_10k + rural_urban_continuum + bachelors + median_household_income + rep_share + black + hispanic + 
                age + female + educ3 + race + inc3 + pid7 + ideo5, 
              design = d, family = quasibinomial(link = "logit"), rescale = T)

tab <- rbind(tab, cbind("overestimating", "log_cases_30", round(summary(fit)$coefficients["log_cases_30","Estimate"],3), 
                        round(summary(fit)$coefficients["log_cases_30","Std. Error"],3),
                        round(summary(fit)$coefficients["log_cases_30","Pr(>|t|)"],3)))

names(tab) <- c("DV", "covid incidence", "estimate", "se", "p value")
tab$estimate <- as.numeric(tab$estimate)
tab$`p value` <- as.numeric(tab$`p value`)
tab$se <- as.numeric(tab$se)

# generate and save table
writeLines(stargazer(tab, summary = F), "Table A.3.txt"); rm(tab, fit)


## Direct effect of media attention

# run regressions
fit_A6.1 <- svyglm(under ~ week + period_1 + period_2 + period_3 + 
                     covid_nonpartisan_estimate + 
                     unemployment + beds_per_10k + rural_urban_continuum + bachelors + median_household_income + rep_share + black + hispanic + 
                     age + female + educ3 + race + inc3 + pid7 + ideo5, 
                   design = d, family = quasibinomial(link = "logit"), rescale = T)

fit_A6.2 <- svyglm(over ~ week + period_1 + period_2 + period_3 + 
                     covid_nonpartisan_estimate + 
                     unemployment + beds_per_10k + rural_urban_continuum + bachelors + median_household_income + rep_share + black + hispanic + 
                     age + female + educ3 + race + inc3 + pid7 + ideo5, 
                   design = d, family = quasibinomial(link = "logit"), rescale = T)

# extract values and save as txt file
writeLines(paste(paste("Coefficient of media salience on underestimations:", round(summary(fit_A6.1)$coefficients["covid_nonpartisan_estimate", "Estimate"],2)),
                 paste("P value of media salience on underestimations:", round(summary(fit_A6.1)$coefficients["covid_nonpartisan_estimate", "Pr(>|t|)"],3)),
                 paste("Coefficient of media salience on underestimations:", round(summary(fit_A6.2)$coefficients["covid_nonpartisan_estimate", "Estimate"],2)),
                 paste("P value of media salience on underestimations:", round(summary(fit_A6.2)$coefficients["covid_nonpartisan_estimate", "Pr(>|t|)"],3)), sep = "\n"), "Appendix A.6.txt"); rm(fit_A6.1, fit_A6.2)


# ~~~~~~~~~~


## Appendix A.8: Alternative specifications of local incidence

# create data frame to store output
tab <- data.frame()

# under ~ deaths

fit <- svyglm(under ~ week + period_1 + period_2 + period_3 + 
                log_deaths*covid_nonpartisan_estimate + 
                unemployment + beds_per_10k + rural_urban_continuum + bachelors + median_household_income + rep_share + black + hispanic + 
                age + female + educ3 + race + inc3 + pid7 + ideo5,
              design = d, family = quasibinomial(link = "logit"), rescale = T)

tab <- rbind(tab, cbind("underestimation", "logged deaths", round(summary(fit)$coefficients["log_deaths:covid_nonpartisan_estimate","Estimate"],3), 
                        round(summary(fit)$coefficients["log_deaths:covid_nonpartisan_estimate","Std. Error"],3),
                        round(summary(fit)$coefficients["log_deaths:covid_nonpartisan_estimate","Pr(>|t|)"],3)))

fit <- svyglm(under ~ week + period_1 + period_2 + period_3 + 
                log_deaths_30*covid_nonpartisan_estimate + 
                unemployment + beds_per_10k + rural_urban_continuum + bachelors + median_household_income + rep_share + black + hispanic + 
                age + female + educ3 + race + inc3 + pid7 + ideo5, 
              design = d, family = quasibinomial(link = "logit"), rescale = T)

tab <- rbind(tab, cbind("underestimation", "logged deaths last 30", round(summary(fit)$coefficients["log_deaths_30:covid_nonpartisan_estimate","Estimate"],3), 
                        round(summary(fit)$coefficients["log_deaths_30:covid_nonpartisan_estimate","Std. Error"],3),
                        round(summary(fit)$coefficients["log_deaths_30:covid_nonpartisan_estimate","Pr(>|t|)"],3)))

fit <- svyglm(under ~ week + period_1 + period_2 + period_3 + 
                deaths_per_10k*covid_nonpartisan_estimate + 
                unemployment + beds_per_10k + rural_urban_continuum + bachelors + median_household_income + rep_share + black + hispanic + 
                age + female + educ3 + race + inc3 + pid7 + ideo5, 
              design = d, family = quasibinomial(link = "logit"), rescale = T)

tab <- rbind(tab, cbind("underestimation", "deaths", round(summary(fit)$coefficients["deaths_per_10k:covid_nonpartisan_estimate","Estimate"],3), 
                        round(summary(fit)$coefficients["deaths_per_10k:covid_nonpartisan_estimate","Std. Error"],3),
                        round(summary(fit)$coefficients["deaths_per_10k:covid_nonpartisan_estimate","Pr(>|t|)"],3)))

fit <- svyglm(under ~ week + period_1 + period_2 + period_3 + 
                deaths_30_per_10k*covid_nonpartisan_estimate + 
                unemployment + beds_per_10k + rural_urban_continuum + bachelors + median_household_income + rep_share + black + hispanic + 
                age + female + educ3 + race + inc3 + pid7 + ideo5, 
              design = d, family = quasibinomial(link = "logit"), rescale = T)

tab <- rbind(tab, cbind("underestimation", "deaths last 30", round(summary(fit)$coefficients["deaths_30_per_10k:covid_nonpartisan_estimate","Estimate"],3), 
                        round(summary(fit)$coefficients["deaths_30_per_10k:covid_nonpartisan_estimate","Std. Error"],3),
                        round(summary(fit)$coefficients["deaths_30_per_10k:covid_nonpartisan_estimate","Pr(>|t|)"],3)))

fit <- svyglm(under ~ week + period_1 + period_2 + period_3 + 
                covid_2*covid_nonpartisan_estimate + 
                unemployment + beds_per_10k + rural_urban_continuum + bachelors + median_household_income + rep_share + black + hispanic + 
                age + female + educ3 + race + inc3 + pid7 + ideo5, 
              design = d, family = quasibinomial(link = "logit"), rescale = T)

tab <- rbind(tab, cbind("underestimation", "percentile deaths", round(summary(fit)$coefficients["covid_2:covid_nonpartisan_estimate","Estimate"],3), 
                        round(summary(fit)$coefficients["covid_2:covid_nonpartisan_estimate","Std. Error"],3),
                        round(summary(fit)$coefficients["covid_2:covid_nonpartisan_estimate","Pr(>|t|)"],3)))

fit <- svyglm(under ~ week + period_1 + period_2 + period_3 + 
                deaths_per_10k*covid_nonpartisan_estimate + 
                unemployment + beds_per_10k + rural_urban_continuum + bachelors + median_household_income + rep_share + black + hispanic + 
                age + female + educ3 + race + inc3 + pid7 + ideo5, 
              design = d1_deaths, family = quasibinomial(link = "logit"), rescale = T)

tab <- rbind(tab, cbind("underestimation", "trimmed deaths", round(summary(fit)$coefficients["deaths_per_10k:covid_nonpartisan_estimate","Estimate"],3), 
                        round(summary(fit)$coefficients["deaths_per_10k:covid_nonpartisan_estimate","Std. Error"],3),
                        round(summary(fit)$coefficients["deaths_per_10k:covid_nonpartisan_estimate","Pr(>|t|)"],3)))

# under ~ cases

fit <- svyglm(under ~ week + period_1 + period_2 + period_3 + 
                log_cases*covid_nonpartisan_estimate + 
                unemployment + beds_per_10k + rural_urban_continuum + bachelors + median_household_income + rep_share + black + hispanic + 
                age + female + educ3 + race + inc3 + pid7 + ideo5,
              design = d, family = quasibinomial(link = "logit"), rescale = T)

tab <- rbind(tab, cbind("underestimation", "logged cases", round(summary(fit)$coefficients["log_cases:covid_nonpartisan_estimate","Estimate"],3), 
                        round(summary(fit)$coefficients["log_cases:covid_nonpartisan_estimate","Std. Error"],3),
                        round(summary(fit)$coefficients["log_cases:covid_nonpartisan_estimate","Pr(>|t|)"],3)))

fit <- svyglm(under ~ week + period_1 + period_2 + period_3 + 
                log_cases_30*covid_nonpartisan_estimate + 
                unemployment + beds_per_10k + rural_urban_continuum + bachelors + median_household_income + rep_share + black + hispanic + 
                age + female + educ3 + race + inc3 + pid7 + ideo5, 
              design = d, family = quasibinomial(link = "logit"), rescale = T)

tab <- rbind(tab, cbind("underestimation", "logged cases last 30", round(summary(fit)$coefficients["log_cases_30:covid_nonpartisan_estimate","Estimate"],3), 
                        round(summary(fit)$coefficients["log_cases_30:covid_nonpartisan_estimate","Std. Error"],3),
                        round(summary(fit)$coefficients["log_cases_30:covid_nonpartisan_estimate","Pr(>|t|)"],3)))

fit <- svyglm(under ~ week + period_1 + period_2 + period_3 + 
                cases_per_10k*covid_nonpartisan_estimate + 
                unemployment + beds_per_10k + rural_urban_continuum + bachelors + median_household_income + rep_share + black + hispanic + 
                age + female + educ3 + race + inc3 + pid7 + ideo5, 
              design = d, family = quasibinomial(link = "logit"), rescale = T)

tab <- rbind(tab, cbind("underestimation", "cases", round(summary(fit)$coefficients["cases_per_10k:covid_nonpartisan_estimate","Estimate"],3), 
                        round(summary(fit)$coefficients["cases_per_10k:covid_nonpartisan_estimate","Std. Error"],3),
                        round(summary(fit)$coefficients["cases_per_10k:covid_nonpartisan_estimate","Pr(>|t|)"],3)))

fit <- svyglm(under ~ week + period_1 + period_2 + period_3 + 
                cases_30_per_10k*covid_nonpartisan_estimate + 
                unemployment + beds_per_10k + rural_urban_continuum + bachelors + median_household_income + rep_share + black + hispanic + 
                age + female + educ3 + race + inc3 + pid7 + ideo5, 
              design = d, family = quasibinomial(link = "logit"), rescale = T)

tab <- rbind(tab, cbind("underestimation", "cases last 30", round(summary(fit)$coefficients["cases_30_per_10k:covid_nonpartisan_estimate","Estimate"],3), 
                        round(summary(fit)$coefficients["cases_30_per_10k:covid_nonpartisan_estimate","Std. Error"],3),
                        round(summary(fit)$coefficients["cases_30_per_10k:covid_nonpartisan_estimate","Pr(>|t|)"],3)))

fit <- svyglm(under ~ week + period_1 + period_2 + period_3 + 
                covid_1*covid_nonpartisan_estimate + 
                unemployment + beds_per_10k + rural_urban_continuum + bachelors + median_household_income + rep_share + black + hispanic + 
                age + female + educ3 + race + inc3 + pid7 + ideo5, 
              design = d, family = quasibinomial(link = "logit"), rescale = T)

tab <- rbind(tab, cbind("underestimation", "percentile cases", round(summary(fit)$coefficients["covid_1:covid_nonpartisan_estimate","Estimate"],3), 
                        round(summary(fit)$coefficients["covid_1:covid_nonpartisan_estimate","Std. Error"],3),
                        round(summary(fit)$coefficients["covid_1:covid_nonpartisan_estimate","Pr(>|t|)"],3)))

fit <- svyglm(under ~ week + period_1 + period_2 + period_3 + 
                cases_per_10k*covid_nonpartisan_estimate + 
                unemployment + beds_per_10k + rural_urban_continuum + bachelors + median_household_income + rep_share + black + hispanic + 
                age + female + educ3 + race + inc3 + pid7 + ideo5, 
              design = d1_cases, family = quasibinomial(link = "logit"), rescale = T)

tab <- rbind(tab, cbind("underestimation", "trimmed cases", round(summary(fit)$coefficients["cases_per_10k:covid_nonpartisan_estimate","Estimate"],3), 
                        round(summary(fit)$coefficients["cases_per_10k:covid_nonpartisan_estimate","Std. Error"],3),
                        round(summary(fit)$coefficients["cases_per_10k:covid_nonpartisan_estimate","Pr(>|t|)"],3)))

# over ~ deaths

fit <- svyglm(over ~ week + period_1 + period_2 + period_3 + 
                log_deaths*covid_nonpartisan_estimate + 
                unemployment + beds_per_10k + rural_urban_continuum + bachelors + median_household_income + rep_share + black + hispanic + 
                age + female + educ3 + race + inc3 + pid7 + ideo5,
              design = d, family = quasibinomial(link = "logit"), rescale = T)

tab <- rbind(tab, cbind("overestimation", "logged deaths", round(summary(fit)$coefficients["log_deaths:covid_nonpartisan_estimate","Estimate"],3), 
                        round(summary(fit)$coefficients["log_deaths:covid_nonpartisan_estimate","Std. Error"],3),
                        round(summary(fit)$coefficients["log_deaths:covid_nonpartisan_estimate","Pr(>|t|)"],3)))

fit <- svyglm(over ~ week + period_1 + period_2 + period_3 + 
                log_deaths_30*covid_nonpartisan_estimate + 
                unemployment + beds_per_10k + rural_urban_continuum + bachelors + median_household_income + rep_share + black + hispanic + 
                age + female + educ3 + race + inc3 + pid7 + ideo5, 
              design = d, family = quasibinomial(link = "logit"), rescale = T)

tab <- rbind(tab, cbind("overestimation", "logged deaths last 30", round(summary(fit)$coefficients["log_deaths_30:covid_nonpartisan_estimate","Estimate"],3), 
                        round(summary(fit)$coefficients["log_deaths_30:covid_nonpartisan_estimate","Std. Error"],3),
                        round(summary(fit)$coefficients["log_deaths_30:covid_nonpartisan_estimate","Pr(>|t|)"],3)))

fit <- svyglm(over ~ week + period_1 + period_2 + period_3 + 
                deaths_per_10k*covid_nonpartisan_estimate + 
                unemployment + beds_per_10k + rural_urban_continuum + bachelors + median_household_income + rep_share + black + hispanic + 
                age + female + educ3 + race + inc3 + pid7 + ideo5, 
              design = d, family = quasibinomial(link = "logit"), rescale = T)

tab <- rbind(tab, cbind("overestimation", "deaths", round(summary(fit)$coefficients["deaths_per_10k:covid_nonpartisan_estimate","Estimate"],3), 
                        round(summary(fit)$coefficients["deaths_per_10k:covid_nonpartisan_estimate","Std. Error"],3),
                        round(summary(fit)$coefficients["deaths_per_10k:covid_nonpartisan_estimate","Pr(>|t|)"],3)))

fit <- svyglm(over ~ week + period_1 + period_2 + period_3 + 
                deaths_30_per_10k*covid_nonpartisan_estimate + 
                unemployment + beds_per_10k + rural_urban_continuum + bachelors + median_household_income + rep_share + black + hispanic + 
                age + female + educ3 + race + inc3 + pid7 + ideo5, 
              design = d, family = quasibinomial(link = "logit"), rescale = T)

tab <- rbind(tab, cbind("overestimation", "deaths last 30", round(summary(fit)$coefficients["deaths_30_per_10k:covid_nonpartisan_estimate","Estimate"],3), 
                        round(summary(fit)$coefficients["deaths_30_per_10k:covid_nonpartisan_estimate","Std. Error"],3),
                        round(summary(fit)$coefficients["deaths_30_per_10k:covid_nonpartisan_estimate","Pr(>|t|)"],3)))

fit <- svyglm(over ~ week + period_1 + period_2 + period_3 + 
                covid_2*covid_nonpartisan_estimate + 
                unemployment + beds_per_10k + rural_urban_continuum + bachelors + median_household_income + rep_share + black + hispanic + 
                age + female + educ3 + race + inc3 + pid7 + ideo5, 
              design = d, family = quasibinomial(link = "logit"), rescale = T)

tab <- rbind(tab, cbind("overestimation", "percentile deaths", round(summary(fit)$coefficients["covid_2:covid_nonpartisan_estimate","Estimate"],3), 
                        round(summary(fit)$coefficients["covid_2:covid_nonpartisan_estimate","Std. Error"],3),
                        round(summary(fit)$coefficients["covid_2:covid_nonpartisan_estimate","Pr(>|t|)"],3)))

fit <- svyglm(over ~ week + period_1 + period_2 + period_3 + 
                deaths_per_10k*covid_nonpartisan_estimate + 
                unemployment + beds_per_10k + rural_urban_continuum + bachelors + median_household_income + rep_share + black + hispanic + 
                age + female + educ3 + race + inc3 + pid7 + ideo5, 
              design = d1_deaths, family = quasibinomial(link = "logit"), rescale = T)

tab <- rbind(tab, cbind("overestimation", "trimmed deaths", round(summary(fit)$coefficients["deaths_per_10k:covid_nonpartisan_estimate","Estimate"],3), 
                        round(summary(fit)$coefficients["deaths_per_10k:covid_nonpartisan_estimate","Std. Error"],3),
                        round(summary(fit)$coefficients["deaths_per_10k:covid_nonpartisan_estimate","Pr(>|t|)"],3)))

# over ~ cases

fit <- svyglm(over ~ week + period_1 + period_2 + period_3 + 
                log_cases*covid_nonpartisan_estimate + 
                unemployment + beds_per_10k + rural_urban_continuum + bachelors + median_household_income + rep_share + black + hispanic + 
                age + female + educ3 + race + inc3 + pid7 + ideo5,
              design = d, family = quasibinomial(link = "logit"), rescale = T)

tab <- rbind(tab, cbind("overestimation", "logged cases", round(summary(fit)$coefficients["log_cases:covid_nonpartisan_estimate","Estimate"],3), 
                        round(summary(fit)$coefficients["log_cases:covid_nonpartisan_estimate","Std. Error"],3),
                        round(summary(fit)$coefficients["log_cases:covid_nonpartisan_estimate","Pr(>|t|)"],3)))

fit <- svyglm(over ~ week + period_1 + period_2 + period_3 + 
                log_cases_30*covid_nonpartisan_estimate + 
                unemployment + beds_per_10k + rural_urban_continuum + bachelors + median_household_income + rep_share + black + hispanic + 
                age + female + educ3 + race + inc3 + pid7 + ideo5, 
              design = d, family = quasibinomial(link = "logit"), rescale = T)

tab <- rbind(tab, cbind("overestimation", "logged cases last 30", round(summary(fit)$coefficients["log_cases_30:covid_nonpartisan_estimate","Estimate"],3), 
                        round(summary(fit)$coefficients["log_cases_30:covid_nonpartisan_estimate","Std. Error"],3),
                        round(summary(fit)$coefficients["log_cases_30:covid_nonpartisan_estimate","Pr(>|t|)"],3)))

fit <- svyglm(over ~ week + period_1 + period_2 + period_3 + 
                cases_per_10k*covid_nonpartisan_estimate + 
                unemployment + beds_per_10k + rural_urban_continuum + bachelors + median_household_income + rep_share + black + hispanic + 
                age + female + educ3 + race + inc3 + pid7 + ideo5, 
              design = d, family = quasibinomial(link = "logit"), rescale = T)

tab <- rbind(tab, cbind("overestimation", "cases", round(summary(fit)$coefficients["cases_per_10k:covid_nonpartisan_estimate","Estimate"],3), 
                        round(summary(fit)$coefficients["cases_per_10k:covid_nonpartisan_estimate","Std. Error"],3),
                        round(summary(fit)$coefficients["cases_per_10k:covid_nonpartisan_estimate","Pr(>|t|)"],3)))

fit <- svyglm(over ~ week + period_1 + period_2 + period_3 + 
                cases_30_per_10k*covid_nonpartisan_estimate + 
                unemployment + beds_per_10k + rural_urban_continuum + bachelors + median_household_income + rep_share + black + hispanic + 
                age + female + educ3 + race + inc3 + pid7 + ideo5, 
              design = d, family = quasibinomial(link = "logit"), rescale = T)

tab <- rbind(tab, cbind("overestimation", "cases last 30", round(summary(fit)$coefficients["cases_30_per_10k:covid_nonpartisan_estimate","Estimate"],3), 
                        round(summary(fit)$coefficients["cases_30_per_10k:covid_nonpartisan_estimate","Std. Error"],3),
                        round(summary(fit)$coefficients["cases_30_per_10k:covid_nonpartisan_estimate","Pr(>|t|)"],3)))

fit <- svyglm(over ~ week + period_1 + period_2 + period_3 + 
                covid_1*covid_nonpartisan_estimate + 
                unemployment + beds_per_10k + rural_urban_continuum + bachelors + median_household_income + rep_share + black + hispanic + 
                age + female + educ3 + race + inc3 + pid7 + ideo5, 
              design = d, family = quasibinomial(link = "logit"), rescale = T)

tab <- rbind(tab, cbind("overestimation", "percentile cases", round(summary(fit)$coefficients["covid_1:covid_nonpartisan_estimate","Estimate"],3), 
                        round(summary(fit)$coefficients["covid_1:covid_nonpartisan_estimate","Std. Error"],3),
                        round(summary(fit)$coefficients["covid_1:covid_nonpartisan_estimate","Pr(>|t|)"],3)))

fit <- svyglm(over ~ week + period_1 + period_2 + period_3 + 
                cases_per_10k*covid_nonpartisan_estimate + 
                unemployment + beds_per_10k + rural_urban_continuum + bachelors + median_household_income + rep_share + black + hispanic + 
                age + female + educ3 + race + inc3 + pid7 + ideo5, 
              design = d1_cases, family = quasibinomial(link = "logit"), rescale = T)

tab <- rbind(tab, cbind("overestimation", "trimmed cases", round(summary(fit)$coefficients["cases_per_10k:covid_nonpartisan_estimate","Estimate"],3), 
                        round(summary(fit)$coefficients["cases_per_10k:covid_nonpartisan_estimate","Std. Error"],3),
                        round(summary(fit)$coefficients["cases_per_10k:covid_nonpartisan_estimate","Pr(>|t|)"],3)))

# generate table
names(tab) <- c("DV", "covid incidence", "estimate", "se", "p value")
tab$estimate <- as.numeric(tab$estimate)
tab$`p value` <- as.numeric(tab$`p value`)
tab$se <- as.numeric(tab$se)

# generate and save table
writeLines(stargazer(tab, summary = F), "Table A.5.txt"); rm(fit, tab, d1_cases, d1_deaths)


# ~~~~~~~~~~


## Appendix A.9: Specification curve analysis

# "bivariate" relationships
fit1 <- svyglm(under ~ week + period_1 + period_2 + period_3 + log_deaths*covid_nonpartisan_estimate, 
               design = d, family = quasibinomial(link = "logit"), rescale = T)

fit2 <- svyglm(under ~ week + period_1 + period_2 + period_3 + log_deaths_30*covid_nonpartisan_estimate, 
               design = d, family = quasibinomial(link = "logit"), rescale = T)

# extract coefficients and p values and save as txt file
writeLines(paste(paste("Estimate logged deaths:", round(summary(fit1)$coefficients["log_deaths:covid_nonpartisan_estimate","Estimate"],3)),
                 paste("P value logged deaths:", round(summary(fit1)$coefficients["log_deaths:covid_nonpartisan_estimate","Pr(>|t|)"],3)), 
                 paste("Estimate logged deaths last 30 days:", round(summary(fit2)$coefficients["log_deaths_30:covid_nonpartisan_estimate","Estimate"],3)), 
                 paste("P value logged deaths last 30 days:", round(summary(fit2)$coefficients["log_deaths_30:covid_nonpartisan_estimate","Pr(>|t|)"],3)), sep = "\n"), "Appendix A.9.txt"); rm(fit1, fit2)


# (1) logged deaths including county-level controls

# create variable "deaths_by_media" that is the interaction of local incidence and national
# news salience, between specification curve doesn't like the interaction term
data$deaths_by_media <- data$log_deaths*data$covid_nonpartisan_estimate

# set up specifications
specs <- setup(data = data,
               x = c("deaths_by_media"),
               y = "under",
               model = "cust",
               controls = c("age", "female", "educ3", "race", "inc3", "pid7", "ideo5", 
                            "median_household_income", "unemployment", "beds_per_10k",
                            "rural_urban_continuum", "bachelors", "rep_share", "black", "hispanic"),
               add_to_formula = c("week + period_1 + period_2 + period_3 + log_deaths + covid_nonpartisan_estimate")) # include these in all models

# create svyglm model for input
cust <- function(formula, data){
  svyglm(formula, 
         design = svydesign(ids = ~FIPS, weights = ~weight, data = data),
         family = quasibinomial(link = "logit"), 
         rescale = T)
}

# set up parallel computing
cl <- makeCluster(detectCores() - 1)
clusterEvalQ(cl, {
  library(specr)
})
clusterExport(cl, list("cust", "specs"))

# run analysis | this takes about 10 hours
start_time <- Sys.time()
results <- specr(specs)
end_time <- Sys.time(); print(end_time - start_time)

# plot estimates (Figure A.5)
p <- plot(results, type = "curve") + geom_hline(yintercept = 0, linetype = "dashed")

# save
ggsave(filename = "Figure A.5.png", plot = p, device = "png", width = 18, height = 11, units = "in", bg = "white")

# extract relevant values and save as txt file
writeLines(paste(paste("Number of permutations:", length(specs$specs$formula)), 
                 paste("Proportion of permutations (logged deaths) that are negative:", sum(results$data$estimate < 0)/nrow(results$data)),
                 paste("Proportion of permutations (logged deaths) that are negative and statistically significant:", sum(results$data$p.value <= 0.05 & results$data$estimate < 0)/nrow(results$data)),
                 paste("Average estimate (logged deaths):", round(mean(results$data$estimate),3)), sep = "\n"), "Appendix A.9 spec curve logged deaths.txt")

# stop the cluster
stopCluster(cl)


# (2) logged cases including county-level controls

# create variable "cases_by_media" that is the interaction of local incidence and national
# news salience, between specification curve doesn't like the interaction term
data$cases_by_media <- data$log_cases*data$covid_nonpartisan_estimate

# set up specifications
specs <- setup(data = data,
               x = c("cases_by_media"),
               y = "under",
               model = "cust",
               controls = c("age", "female", "educ3", "race", "inc3", "pid7", "ideo5",
                            "median_household_income", "unemployment", "beds_per_10k",
                            "rural_urban_continuum", "bachelors", "rep_share", "black", "hispanic"),
               add_to_formula = c("week + period_1 + period_2 + period_3 + log_cases + covid_nonpartisan_estimate")) # include these in all models

# set up parallel computing
cl <- makeCluster(detectCores() - 1)
clusterEvalQ(cl, {
  library(specr)
})
clusterExport(cl, list("cust", "specs"))

# run analysis | this takes about 10 hours
start_time <- Sys.time(); print(start_time)
results <- specr(specs)
end_time <- Sys.time(); print(end_time - start_time)

# plot estimates (Figure A.6)
p <- plot(results, type = "curve") + geom_hline(yintercept = 0, linetype = "dashed")

# save
ggsave(filename = "Figure A.6.png", plot = p, device = "png", width = 18, height = 11, units = "in", bg = "white")

# extract relevant values and save as txt file
writeLines(paste(paste("Number of permutations:", length(specs$specs$formula)), 
                 paste("Proportion of permutations (logged deaths) that are negative:", sum(results$data$estimate < 0)/nrow(results$data)),
                 paste("Proportion of permutations (logged deaths) that are negative and statistically significant:", sum(results$data$p.value <= 0.05 & results$data$estimate < 0)/nrow(results$data)),
                 paste("Average estimate (logged deaths):", round(mean(results$data$estimate),3)), sep = "\n"), "Appendix A.9 spec curve logged cases.txt")

# stop the cluster
stopCluster(cl); rm(cust, specs, results, p, cl)


# ~~~~~~~~~~


## Appendix A.10: Assessment of functional form

# to run binning estimator, need to convert categorical controls into factors
data$educ3_fac <- as.factor(data$educ3)
data$inc3_fac <- as.factor(data$inc3)
data$pid7_fac <- as.factor(data$pid7)
data$race_fac <- as.factor(data$race)
data$ideo5_fac <- as.factor(data$ideo5)

# fit binning estimator using logged deaths as measure of local incidence
fit <- interflex(estimator = "binning",
                 data = as.data.frame(data),
                 Y = "under", D = "covid_nonpartisan_estimate", X = "log_deaths", 
                 Z = c("week", "period_1", "period_2", "period_3", "unemployment", "beds_per_10k", "rural_urban_continuum", "bachelors", 
                       "median_household_income", "rep_share", "age", "female", "educ3_fac", "inc3_fac", "pid7_fac", "race_fac", "ideo5_fac", "black", "hispanic"),
                 vcov.type = "robust", 
                 weights = "weight",
                 na.rm = T)

# plot
p <- plot(fit, xlab = "Logged COVID-19 deaths per 10,000 residents", ylab = "Marginal effect of media coverage on under-estimations")

# save
ggsave(filename = "Figure A.7.png", plot = p, device = "png", width = 18, height = 11, units = "in", bg = "white")

# fit binning estimator using logged cases as measure of local incidence
fit <- interflex(estimator = "binning",
                 data = as.data.frame(data),
                 Y = "under", D = "covid_nonpartisan_estimate", X = "log_cases", 
                 Z = c("week", "period_1", "period_2", "period_3", "unemployment", "beds_per_10k", "rural_urban_continuum", "bachelors", 
                       "median_household_income", "rep_share", "age", "female", "educ3_fac", "inc3_fac", "pid7_fac", "race_fac", "ideo5_fac", "black", "hispanic"),
                 vcov.type = "robust", 
                 weights = "weight",
                 na.rm = T)

# plot
p <- plot(fit, xlab = "Logged COVID-19 cases per 10,000 residents", ylab = "Marginal effect of media coverage on under-estimations")

# save
ggsave(filename = "Figure A.8.png", plot = p, device = "png", width = 18, height = 11, units = "in", bg = "white"); rm(p, fit)


# ~~~~~~~~~~


## Appendix A.11: Sample stability

# number of iterations
num_iter <- 1000

# extract unique county codes
unique_fips <- unique(data$FIPS)

# initialize objects to store results
est_a <- est_b <- est_c <- est_d <- numeric(num_iter)
p_a <- p_b <- p_c <- p_d <- numeric(num_iter)

# create function to run one iteration
run_iteration <- function(i){
  
  # set seed for reproducability
  set.seed(i)
  
  # extract random sample of 95% of counties
  counties <- sample(unique_fips, size = 0.95 * length(unique_fips), replace = FALSE)
  
  # survey design object
  d_sub <- svydesign(ids = ~FIPS, weights = ~weight, data = data %>% filter(FIPS %in% counties))
  
  # regressions
  fit_a <- svyglm(under ~ week + period_1 + period_2 + period_3 + 
                    log_cases*covid_nonpartisan_estimate + 
                    unemployment + beds_per_10k + rural_urban_continuum + bachelors + median_household_income + rep_share + black + hispanic + 
                    age + female + educ3 + race + inc3 + pid7 + ideo5,, 
                  design = d_sub, family = quasibinomial(link = "logit"), rescale = TRUE)
  
  fit_b <- svyglm(under ~ week + period_1 + period_2 + period_3 + 
                    log_deaths*covid_nonpartisan_estimate + 
                    unemployment + beds_per_10k + rural_urban_continuum + bachelors + median_household_income + rep_share + black + hispanic +
                    age + female + educ3 + race + inc3 + pid7 + ideo5, 
                  design = d_sub, family = quasibinomial(link = "logit"), rescale = TRUE)
  
  fit_c <- svyglm(over ~ week + period_1 + period_2 + period_3 + 
                    log_cases*covid_nonpartisan_estimate + 
                    unemployment + beds_per_10k + rural_urban_continuum + bachelors + median_household_income + rep_share + black + hispanic + 
                    age + female + educ3 + race + inc3 + pid7 + ideo5, 
                  design = d_sub, family = quasibinomial(link = "logit"), rescale = TRUE)
  
  fit_d <- svyglm(over ~ week + period_1 + period_2 + period_3 + 
                    log_deaths*covid_nonpartisan_estimate + 
                    unemployment + beds_per_10k + rural_urban_continuum + bachelors + median_household_income + rep_share + black + hispanic + 
                    age + female + educ3 + race + inc3 + pid7 + ideo5, 
                  design = d_sub, family = quasibinomial(link = "logit"), rescale = TRUE)
  
  # store results
  results <- c(
    est_a = fit_a$coefficients["log_cases:covid_nonpartisan_estimate"],
    est_b = fit_b$coefficients["log_deaths:covid_nonpartisan_estimate"],
    est_c = fit_c$coefficients["log_cases:covid_nonpartisan_estimate"],
    est_d = fit_d$coefficients["log_deaths:covid_nonpartisan_estimate"],
    p_a = summary(fit_a)$coefficients["log_cases:covid_nonpartisan_estimate", "Pr(>|t|)"],
    p_b = summary(fit_b)$coefficients["log_deaths:covid_nonpartisan_estimate", "Pr(>|t|)"],
    p_c = summary(fit_c)$coefficients["log_cases:covid_nonpartisan_estimate", "Pr(>|t|)"],
    p_d = summary(fit_d)$coefficients["log_deaths:covid_nonpartisan_estimate", "Pr(>|t|)"]
  )
  
  # return results
  return(results)
  
}

# set up parallel computing
cl <- makeCluster(detectCores() - 1)
clusterEvalQ(cl, {
  library(survey); library(dplyr); library(tidyverse)
})
clusterExport(cl, list("unique_fips", "data", "run_iteration"))

# run iterations in parallel | this takes <5 minutes when run in parallel
results <- parLapply(cl, 1:num_iter, run_iteration)

# stop the cluster
stopCluster(cl)

# extract results into vectors
for (i in 1:num_iter) {
  est_a[i] <- results[[i]]["est_a.log_cases:covid_nonpartisan_estimate"]
  est_b[i] <- results[[i]]["est_b.log_deaths:covid_nonpartisan_estimate"]
  est_c[i] <- results[[i]]["est_c.log_cases:covid_nonpartisan_estimate"]
  est_d[i] <- results[[i]]["est_d.log_deaths:covid_nonpartisan_estimate"]
  p_a[i] <- results[[i]]["p_a"]
  p_b[i] <- results[[i]]["p_b"]
  p_c[i] <- results[[i]]["p_c"]
  p_d[i] <- results[[i]]["p_d"]
}; rm(i)

# combine into one data frame for visualizing
df <- as.data.frame(cbind(c(est_a, est_b, est_c, est_d), 
                          c(p_a, p_b, p_c, p_d), 
                          c(rep("Underestimation", 2*num_iter), rep("Overestimation", 2*num_iter)),
                          c(rep(c(rep("Logged COVID-19 cases", num_iter), rep("Logged COVID-19 deaths", num_iter)),2))))
colnames(df) <- c("estimate", "p", "DV", "IV")
df$estimate <- as.numeric(df$estimate); df$p <- as.numeric(df$p)
df$p_bin <- ifelse(df$p < 0.05, "p value < 0.05", "p value >= 0.05")
df$num <- rep(1:1000,4); rm(p_a, p_b, p_c, p_d, est_a, est_b, est_c, est_d, num_iter, unique_fips, cl)

# plot
p <- ggplot(df, aes(x = num, y = estimate, group = p_bin)) + 
  geom_point(aes(color = p_bin)) + 
  theme_bw() + 
  facet_grid(cols = vars(DV), rows = vars(IV)) + 
  scale_color_manual(values = c("black", "red")) + 
  xlab("") + 
  theme(text=element_text(size=20),
        legend.position = "bottom",
        legend.title=element_blank())

# save
ggsave(filename = "Figure A.9.png", plot = p, device = "png", width = 18, height = 15, units = "in", bg = "white")

# generate and save relevant numbers as txt file
writeLines(paste(paste("Average estimate (underestimations, cases):", round(mean(df[df$DV == "Underestimation" & df$IV == "Logged COVID-19 cases",]$estimate),3)), 
                 paste("Average estimate (underestimations, deaths):", round(mean(df[df$DV == "Underestimation" & df$IV == "Logged COVID-19 deaths",]$estimate),3)), 
                 paste("Proportion of estimates (underestimations, cases) that are negative and statistically significant:", round(sum(df[df$DV == "Underestimation" & df$IV == "Logged COVID-19 deaths",]$estimate < 0 & df[df$DV == "Underestimation" & df$IV == "Logged COVID-19 cases",]$p < 0.05)/1000, 3)), 
                 paste("Proportion of estimates (underestimations, deaths) that are negative and statistically significant:", round(sum(df[df$DV == "Underestimation" & df$IV == "Logged COVID-19 deaths",]$estimate < 0 & df[df$DV == "Underestimation" & df$IV == "Logged COVID-19 deaths",]$p < 0.05)/1000, 3)), sep = "\n"), "Appendix A.11.txt")

rm(run_iteration, p, df, results)


# ~~~~~~~~~~


## Appendix A.12: Alternative specifications of media attention

# create data frame to store output
tab <- data.frame()

# under ~ deaths
fit <- svyglm(under ~ week + period_1 + period_2 + period_3 + 
                log_deaths*covid_nonpartisan_estimate + 
                unemployment + beds_per_10k + rural_urban_continuum + bachelors + median_household_income + rep_share + black + hispanic + 
                age + female + educ3 + race + inc3 + pid7 + ideo5, 
              design = d, family = quasibinomial(link = "logit"), rescale = T)

tab <- rbind(tab, cbind("underestimation", "1 week", summary(fit)$coefficients["log_deaths:covid_nonpartisan_estimate", "Estimate"],
                        summary(fit)$coefficients["log_deaths:covid_nonpartisan_estimate", "Std. Error"],
                        summary(fit)$coefficients["log_deaths:covid_nonpartisan_estimate", "Pr(>|t|)"]))

fit <- svyglm(under ~ week + period_1 + period_2 + period_3 + 
                log_deaths*covid_2weeks_estimate + 
                unemployment + beds_per_10k + rural_urban_continuum + bachelors + median_household_income + rep_share + black + hispanic + 
                age + female + educ3 + race + inc3 + pid7 + ideo5, 
              design = d, family = quasibinomial(link = "logit"), rescale = T)

tab <- rbind(tab, cbind("underestimation", "2 weeks", summary(fit)$coefficients["log_deaths:covid_2weeks_estimate", "Estimate"],
                        summary(fit)$coefficients["log_deaths:covid_2weeks_estimate", "Std. Error"],
                        summary(fit)$coefficients["log_deaths:covid_2weeks_estimate", "Pr(>|t|)"]))

fit <- svyglm(under ~ week + period_1 + period_2 + period_3 + 
                log_deaths*covid_1month_estimate + 
                unemployment + beds_per_10k + rural_urban_continuum + bachelors + median_household_income + rep_share + black + hispanic + 
                age + female + educ3 + race + inc3 + pid7 + ideo5, 
              design = d, family = quasibinomial(link = "logit"), rescale = T)

tab <- rbind(tab, cbind("underestimation", "1 month", summary(fit)$coefficients["log_deaths:covid_1month_estimate", "Estimate"],
                        summary(fit)$coefficients["log_deaths:covid_1month_estimate", "Std. Error"],
                        summary(fit)$coefficients["log_deaths:covid_1month_estimate", "Pr(>|t|)"]))

# over ~ deaths
fit <- svyglm(over ~ week + period_1 + period_2 + period_3 + 
                log_deaths*covid_nonpartisan_estimate + 
                unemployment + beds_per_10k + rural_urban_continuum + bachelors + median_household_income + rep_share + black + hispanic + 
                age + female + educ3 + race + inc3 + pid7 + ideo5, 
              design = d, family = quasibinomial(link = "logit"), rescale = T)

tab <- rbind(tab, cbind("overestimation", "1 week", summary(fit)$coefficients["log_deaths:covid_nonpartisan_estimate", "Estimate"],
                        summary(fit)$coefficients["log_deaths:covid_nonpartisan_estimate", "Std. Error"],
                        summary(fit)$coefficients["log_deaths:covid_nonpartisan_estimate", "Pr(>|t|)"]))

fit <- svyglm(over ~ week + period_1 + period_2 + period_3 + 
                log_deaths*covid_2weeks_estimate + 
                unemployment + beds_per_10k + rural_urban_continuum + bachelors + median_household_income + rep_share + black + hispanic + 
                age + female + educ3 + race + inc3 + pid7 + ideo5, 
              design = d, family = quasibinomial(link = "logit"), rescale = T)

tab <- rbind(tab, cbind("overestimation", "2 weeks", summary(fit)$coefficients["log_deaths:covid_2weeks_estimate", "Estimate"],
                        summary(fit)$coefficients["log_deaths:covid_2weeks_estimate", "Std. Error"],
                        summary(fit)$coefficients["log_deaths:covid_2weeks_estimate", "Pr(>|t|)"]))

fit <- svyglm(over ~ week + period_1 + period_2 + period_3 + 
                log_deaths*covid_1month_estimate + 
                unemployment + beds_per_10k + rural_urban_continuum + bachelors + median_household_income + rep_share + black + hispanic + 
                age + female + educ3 + race + inc3 + pid7 + ideo5, 
              design = d, family = quasibinomial(link = "logit"), rescale = T)

tab <- rbind(tab, cbind("overestimation", "1 month", summary(fit)$coefficients["log_deaths:covid_1month_estimate", "Estimate"],
                        summary(fit)$coefficients["log_deaths:covid_1month_estimate", "Std. Error"],
                        summary(fit)$coefficients["log_deaths:covid_1month_estimate", "Pr(>|t|)"]))

# regression table
names(tab) <- c("DV", "media", "estimate", "se", "p value")
tab$estimate <- as.numeric(tab$estimate)
tab$`p value` <- as.numeric(tab$`p value`)
tab$se <- as.numeric(tab$se)

# generate and save Table A.6
writeLines(stargazer(tab, summary = F), "Table A.6.txt"); rm(tab, fit)


# ~~~~~~~~~~


## Appendix A.13: Heterogeneity by educational attainment

# convert education to factor
data$educ3 <- as.factor(data$educ3) %>%
  relevel(ref = "Some or 2-year college") %>%
  relevel(ref = "High school or less")

# run triple-interaction regressions
fit1t <- svyglm(under ~ week + period_1 + period_2 + period_3 + 
                  log_deaths*covid_nonpartisan_estimate*educ3 + 
                  unemployment + beds_per_10k + rural_urban_continuum + bachelors + median_household_income + rep_share + black + hispanic + 
                  age + female + pid7 + race + inc3 + ideo5, 
                design = d, family = quasibinomial(link = "logit"), rescale = T)

fit2t <- svyglm(under ~ week + period_1 + period_2 + period_3 + 
                  log_deaths_30*covid_nonpartisan_estimate*educ3 + 
                  unemployment + beds_per_10k + rural_urban_continuum + bachelors + median_household_income + rep_share + black + hispanic + 
                  age + female + pid7 + race + inc3 + ideo5, 
                design = d, family = quasibinomial(link = "logit"), rescale = T)

# generate marginal effects plots
p1 <- plot_model(fit1t, type = "pred", terms = c("log_deaths", "covid_nonpartisan_estimate [0.06260057, 0.2431676]", "educ3")) + 
  theme_bw() + 
  labs(x = "Logged cumulative per-capita COVID-19 deaths", y = "Underestimating", title = "", color = "Media salience") + 
  scale_color_manual(values = c("forestgreen", "purple"), labels = c("min", "max")) + 
  scale_fill_manual(values = c("forestgreen", "purple"), labels = c("min", "max")) +
  theme(legend.position = "bottom",
        text=element_text(size=20))

p2 <- plot_model(fit2t, type = "pred", terms = c("log_deaths_30", "covid_nonpartisan_estimate [0.06260057, 0.2431676]", "educ3")) + 
  theme_bw() + 
  labs(x = "Logged per-capita COVID-19 deaths in last 30 days", y = "Underestimating", title = "", color = "Media salience") +
  scale_color_manual(values = c("forestgreen", "purple"), labels = c("min", "max")) + 
  scale_fill_manual(values = c("forestgreen", "purple"), labels = c("min", "max")) +
  theme(legend.position = "bottom",
        text=element_text(size=20))

p <- ggarrange(p1, p2, ncol = 1, nrow = 2, common.legend = T, legend = "bottom"); rm(p1, p2, fit1t, fit2t)

# save Figure A.11
ggsave(filename = "Figure A.11.png", plot = p, device = "png", width = 18, height = 15, units = "in", bg = "white"); rm(p)

# calculate and save as txt file relevant number
writeLines(paste(paste("Proportion preferring national news source (bachelors):", round(mean(data[!is.na(data$econQ2) & data$educ3 == "Bachelor's degree or more",]$econQ2 %in% c(1,2,4)),3)),
                 paste("Proportion preferring national news source (high school):", round(mean(data[!is.na(data$econQ2) & data$educ3 == "High school or less",]$econQ2 %in% c(1,2,4)),3)), sep = "\n"), "Appendix A.13.txt")


# ~~~~~~~~~~


## Appendix A.14: Overestimations and full scale

# run triple-interaction regressions
fit1t <- svyglm(over ~ week + period_1 + period_2 + period_3 + 
                  log_deaths*covid_nonpartisan_estimate*party + 
                  unemployment + beds_per_10k + rural_urban_continuum + bachelors + median_household_income + rep_share + black + hispanic + 
                  age + female + educ3 + race + inc3 + ideo5, 
                design = d, family = quasibinomial(link = "logit"), rescale = T)

fit2t <- svyglm(over ~ week + period_1 + period_2 + period_3 + 
                  log_deaths_30*covid_nonpartisan_estimate*party + 
                  unemployment + beds_per_10k + rural_urban_continuum + bachelors + median_household_income + rep_share + black + hispanic + 
                  age + female + educ3 + race + inc3 + ideo5, 
                design = d, family = quasibinomial(link = "logit"), rescale = T)

# generate marginal effects plots
p1 <- plot_model(fit1t, type = "pred", terms = c("log_deaths", "covid_nonpartisan_estimate [0.06260057, 0.2431676]", "party")) + 
  theme_bw() + 
  labs(x = "Logged cumulative per-capita COVID-19 deaths", y = "Overestimating", title = "", color = "Media salience") + 
  scale_color_manual(values = c("forestgreen", "purple"), labels = c("min", "max")) + 
  scale_fill_manual(values = c("forestgreen", "purple"), labels = c("min", "max")) +
  theme(legend.position = "bottom",
        text=element_text(size=20))

p2 <- plot_model(fit2t, type = "pred", terms = c("log_deaths_30", "covid_nonpartisan_estimate [0.06260057, 0.2431676]", "party")) + 
  theme_bw() + 
  labs(x = "Logged per-capita COVID-19 deaths in last 30 days", y = "Overestimating", title = "", color = "Media salience") +
  scale_color_manual(values = c("forestgreen", "purple"), labels = c("min", "max")) + 
  scale_fill_manual(values = c("forestgreen", "purple"), labels = c("min", "max")) +
  theme(legend.position = "bottom",
        text=element_text(size=20))

p <- ggarrange(p1, p2, ncol = 1, nrow = 2, common.legend = T, legend = "bottom"); rm(p1, p2, fit1t, fit2t)

# save Figure A.12
ggsave(filename = "Figure A.12.png", plot = p, device = "png", width = 18, height = 15, units = "in", bg = "white"); rm(p)


# ~~~~~~~~~~


## Appendix A.15: Partisan news

# save number of respondents indicating Fox and MSNBC as their preferred source of news
writeLines(paste(paste("Fox:", sum(data$cablenews == 2, na.rm = T)), paste("MSNBC:", sum(data$cablenews == 3, na.rm = T)), sep = "\n"), "Appendix A.15.txt")


# (1) Fox

# load hand-coding
setwd("../data/raw")
hand_coding <- read.csv("hand_coding.csv") %>% dplyr::select(c(week, n_fox_downplay))

# merge into data
data <- left_join(data, hand_coding); rm(hand_coding)

# create weighted Fox attention variable by number of shows that week that downplayed
data$FOX_covid_weighted <- data$FOX_covid_estimate*data$n_fox_downplay

# create survey design object with sample limited to self-reported Fox viewers
d_fox <- svydesign(ids = ~FIPS, weights = ~weight, data = data %>% filter(cablenews == 2))

# create data frame to store output
tab <- data.frame()

# regressions

# under
fit <- svyglm(under ~ week + period_1 + period_2 + period_3 + 
                log_cases*FOX_covid_estimate + 
                unemployment + beds_per_10k + rural_urban_continuum + bachelors + median_household_income + rep_share + black + hispanic + 
                age + female + educ3 + race + inc3 + pid7 + ideo5, 
              design = d_fox, family = quasibinomial(link = "logit"), rescale = T)

tab <- rbind(tab, cbind("under", "logged cases", "FOX", round(summary(fit)$coefficients["log_cases:FOX_covid_estimate", "Estimate"],3), 
                        round(summary(fit)$coefficients["log_cases:FOX_covid_estimate", "Std. Error"],3),
                        round(summary(fit)$coefficients["log_cases:FOX_covid_estimate", "Pr(>|t|)"],3)))

fit <- svyglm(under ~ week + period_1 + period_2 + period_3 + 
                log_deaths*FOX_covid_estimate + 
                unemployment + beds_per_10k + rural_urban_continuum + bachelors + median_household_income + rep_share + black + hispanic + 
                age + female + educ3 + race + inc3 + pid7 + ideo5, 
              design = d_fox, family = quasibinomial(link = "logit"), rescale = T)

tab <- rbind(tab, cbind("under", "logged deaths", "FOX", round(summary(fit)$coefficients["log_deaths:FOX_covid_estimate", "Estimate"],3), 
                        round(summary(fit)$coefficients["log_deaths:FOX_covid_estimate", "Std. Error"],3),
                        round(summary(fit)$coefficients["log_deaths:FOX_covid_estimate", "Pr(>|t|)"],3)))

fit <- svyglm(under ~ week + period_1 + period_2 + period_3 + 
                log_cases*FOX_covid_weighted + 
                unemployment + beds_per_10k + rural_urban_continuum + bachelors + median_household_income + rep_share + black + hispanic + 
                age + female + educ3 + race + inc3 + pid7 + ideo5, 
              design = d_fox, family = quasibinomial(link = "logit"), rescale = T)

tab <- rbind(tab, cbind("under", "logged cases", "FOX weighted", round(summary(fit)$coefficients["log_cases:FOX_covid_weighted", "Estimate"],3), 
                        round(summary(fit)$coefficients["log_cases:FOX_covid_weighted", "Std. Error"],3),
                        round(summary(fit)$coefficients["log_cases:FOX_covid_weighted", "Pr(>|t|)"],3)))

fit <- svyglm(under ~ week + period_1 + period_2 + period_3 + 
                log_deaths*FOX_covid_weighted + 
                unemployment + beds_per_10k + rural_urban_continuum + bachelors + median_household_income + rep_share + black + hispanic + 
                age + female + educ3 + race + inc3 + pid7 + ideo5, 
              design = d_fox, family = quasibinomial(link = "logit"), rescale = T)

tab <- rbind(tab, cbind("under", "logged deaths", "FOX weighted", round(summary(fit)$coefficients["log_deaths:FOX_covid_weighted", "Estimate"],3), 
                        round(summary(fit)$coefficients["log_deaths:FOX_covid_weighted", "Std. Error"],3),
                        round(summary(fit)$coefficients["log_deaths:FOX_covid_weighted", "Pr(>|t|)"],3)))

fit <- svyglm(under ~ week + period_1 + period_2 + period_3 + 
                cases_per_10k*FOX_covid_estimate + 
                unemployment + beds_per_10k + rural_urban_continuum + bachelors + median_household_income + rep_share + black + hispanic + 
                age + female + educ3 + race + inc3 + pid7 + ideo5, 
              design = d_fox, family = quasibinomial(link = "logit"), rescale = T)

tab <- rbind(tab, cbind("under", "cases", "FOX", round(summary(fit)$coefficients["cases_per_10k:FOX_covid_estimate", "Estimate"],3), 
                        round(summary(fit)$coefficients["cases_per_10k:FOX_covid_estimate", "Std. Error"],3),
                        round(summary(fit)$coefficients["cases_per_10k:FOX_covid_estimate", "Pr(>|t|)"],3)))

fit <- svyglm(under ~ week + period_1 + period_2 + period_3 + 
                deaths_per_10k*FOX_covid_estimate + 
                unemployment + beds_per_10k + rural_urban_continuum + bachelors + median_household_income + rep_share + black + hispanic + 
                age + female + educ3 + race + inc3 + pid7 + ideo5, 
              design = d_fox, family = quasibinomial(link = "logit"), rescale = T)

tab <- rbind(tab, cbind("under", "deaths", "FOX", round(summary(fit)$coefficients["deaths_per_10k:FOX_covid_estimate", "Estimate"],3), 
                        round(summary(fit)$coefficients["deaths_per_10k:FOX_covid_estimate", "Std. Error"],3),
                        round(summary(fit)$coefficients["deaths_per_10k:FOX_covid_estimate", "Pr(>|t|)"],3)))

fit <- svyglm(under ~ week + period_1 + period_2 + period_3 + 
                cases_per_10k*FOX_covid_weighted + 
                unemployment + beds_per_10k + rural_urban_continuum + bachelors + median_household_income + rep_share + black + hispanic + 
                age + female + educ3 + race + inc3 + pid7 + ideo5, 
              design = d_fox, family = quasibinomial(link = "logit"), rescale = T)

tab <- rbind(tab, cbind("under", "cases", "FOX weighted", round(summary(fit)$coefficients["cases_per_10k:FOX_covid_weighted", "Estimate"],3), 
                        round(summary(fit)$coefficients["cases_per_10k:FOX_covid_weighted", "Std. Error"],3),
                        round(summary(fit)$coefficients["cases_per_10k:FOX_covid_weighted", "Pr(>|t|)"],3)))

fit <- svyglm(under ~ week + period_1 + period_2 + period_3 + 
                deaths_per_10k*FOX_covid_weighted + 
                unemployment + beds_per_10k + rural_urban_continuum + bachelors + median_household_income + rep_share + black + hispanic + 
                age + female + educ3 + race + inc3 + pid7 + ideo5, 
              design = d_fox, family = quasibinomial(link = "logit"), rescale = T)

tab <- rbind(tab, cbind("under", "deaths", "FOX weighted", round(summary(fit)$coefficients["deaths_per_10k:FOX_covid_weighted", "Estimate"],3), 
                        round(summary(fit)$coefficients["deaths_per_10k:FOX_covid_weighted", "Std. Error"],3),
                        round(summary(fit)$coefficients["deaths_per_10k:FOX_covid_weighted", "Pr(>|t|)"],3)))

# over
fit <- svyglm(over ~ week + period_1 + period_2 + period_3 + 
                log_cases*FOX_covid_estimate + 
                unemployment + beds_per_10k + rural_urban_continuum + bachelors + median_household_income + rep_share + black + hispanic + 
                age + female + educ3 + race + inc3 + pid7 + ideo5, 
              design = d_fox, family = quasibinomial(link = "logit"), rescale = T)

tab <- rbind(tab, cbind("over", "logged cases", "FOX", round(summary(fit)$coefficients["log_cases:FOX_covid_estimate", "Estimate"],3), 
                        round(summary(fit)$coefficients["log_cases:FOX_covid_estimate", "Std. Error"],3),
                        round(summary(fit)$coefficients["log_cases:FOX_covid_estimate", "Pr(>|t|)"],3)))

fit <- svyglm(over ~ week + period_1 + period_2 + period_3 + 
                log_deaths*FOX_covid_estimate + 
                unemployment + beds_per_10k + rural_urban_continuum + bachelors + median_household_income + rep_share + black + hispanic + 
                age + female + educ3 + race + inc3 + pid7 + ideo5, 
              design = d_fox, family = quasibinomial(link = "logit"), rescale = T)

tab <- rbind(tab, cbind("over", "logged deaths", "FOX", round(summary(fit)$coefficients["log_deaths:FOX_covid_estimate", "Estimate"],3), 
                        round(summary(fit)$coefficients["log_deaths:FOX_covid_estimate", "Std. Error"],3),
                        round(summary(fit)$coefficients["log_deaths:FOX_covid_estimate", "Pr(>|t|)"],3)))

fit <- svyglm(over ~ week + period_1 + period_2 + period_3 + 
                log_cases*FOX_covid_weighted + 
                unemployment + beds_per_10k + rural_urban_continuum + bachelors + median_household_income + rep_share + black + hispanic + 
                age + female + educ3 + race + inc3 + pid7 + ideo5, 
              design = d_fox, family = quasibinomial(link = "logit"), rescale = T)

tab <- rbind(tab, cbind("over", "logged cases", "FOX weighted", round(summary(fit)$coefficients["log_cases:FOX_covid_weighted", "Estimate"],3), 
                        round(summary(fit)$coefficients["log_cases:FOX_covid_weighted", "Std. Error"],3),
                        round(summary(fit)$coefficients["log_cases:FOX_covid_weighted", "Pr(>|t|)"],3)))

fit <- svyglm(over ~ week + period_1 + period_2 + period_3 + 
                log_deaths*FOX_covid_weighted + 
                unemployment + beds_per_10k + rural_urban_continuum + bachelors + median_household_income + rep_share + black + hispanic + 
                age + female + educ3 + race + inc3 + pid7 + ideo5, 
              design = d_fox, family = quasibinomial(link = "logit"), rescale = T)

tab <- rbind(tab, cbind("over", "logged deaths", "FOX weighted", round(summary(fit)$coefficients["log_deaths:FOX_covid_weighted", "Estimate"],3), 
                        round(summary(fit)$coefficients["log_deaths:FOX_covid_weighted", "Std. Error"],3),
                        round(summary(fit)$coefficients["log_deaths:FOX_covid_weighted", "Pr(>|t|)"],3)))

fit <- svyglm(over ~ week + period_1 + period_2 + period_3 + 
                cases_per_10k*FOX_covid_estimate + 
                unemployment + beds_per_10k + rural_urban_continuum + bachelors + median_household_income + rep_share + black + hispanic + 
                age + female + educ3 + race + inc3 + pid7 + ideo5, 
              design = d_fox, family = quasibinomial(link = "logit"), rescale = T)

tab <- rbind(tab, cbind("over", "cases", "FOX", round(summary(fit)$coefficients["cases_per_10k:FOX_covid_estimate", "Estimate"],3), 
                        round(summary(fit)$coefficients["cases_per_10k:FOX_covid_estimate", "Std. Error"],3),
                        round(summary(fit)$coefficients["cases_per_10k:FOX_covid_estimate", "Pr(>|t|)"],3)))

fit <- svyglm(over ~ week + period_1 + period_2 + period_3 + 
                deaths_per_10k*FOX_covid_estimate + 
                unemployment + beds_per_10k + rural_urban_continuum + bachelors + median_household_income + rep_share + black + hispanic + 
                age + female + educ3 + race + inc3 + pid7 + ideo5, 
              design = d_fox, family = quasibinomial(link = "logit"), rescale = T)

tab <- rbind(tab, cbind("over", "deaths", "FOX", round(summary(fit)$coefficients["deaths_per_10k:FOX_covid_estimate", "Estimate"],3), 
                        round(summary(fit)$coefficients["deaths_per_10k:FOX_covid_estimate", "Std. Error"],3),
                        round(summary(fit)$coefficients["deaths_per_10k:FOX_covid_estimate", "Pr(>|t|)"],3)))

fit <- svyglm(over ~ week + period_1 + period_2 + period_3 + 
                cases_per_10k*FOX_covid_weighted + 
                unemployment + beds_per_10k + rural_urban_continuum + bachelors + median_household_income + rep_share + black + hispanic + 
                age + female + educ3 + race + inc3 + pid7 + ideo5, 
              design = d_fox, family = quasibinomial(link = "logit"), rescale = T)

tab <- rbind(tab, cbind("over", "cases", "FOX weighted", round(summary(fit)$coefficients["cases_per_10k:FOX_covid_weighted", "Estimate"],3), 
                        round(summary(fit)$coefficients["cases_per_10k:FOX_covid_weighted", "Std. Error"],3),
                        round(summary(fit)$coefficients["cases_per_10k:FOX_covid_weighted", "Pr(>|t|)"],3)))

fit <- svyglm(over ~ week + period_1 + period_2 + period_3 + 
                deaths_per_10k*FOX_covid_weighted + 
                unemployment + beds_per_10k + rural_urban_continuum + bachelors + median_household_income + rep_share + black + hispanic + 
                age + female + educ3 + race + inc3 + pid7 + ideo5, 
              design = d_fox, family = quasibinomial(link = "logit"), rescale = T)

tab <- rbind(tab, cbind("over", "deaths", "FOX weighted", round(summary(fit)$coefficients["deaths_per_10k:FOX_covid_weighted", "Estimate"],3), 
                        round(summary(fit)$coefficients["deaths_per_10k:FOX_covid_weighted", "Std. Error"],3),
                        round(summary(fit)$coefficients["deaths_per_10k:FOX_covid_weighted", "Pr(>|t|)"],3)))

# prep output
names(tab) <- c("DV", "covid incidence", "media", "estimate", "se", "p value")
tab$estimate <- as.numeric(tab$estimate)
tab$`p value` <- as.numeric(tab$`p value`)
tab$se <- as.numeric(tab$se)

# generate and save Table A.7
setwd("../../output")
writeLines(stargazer(tab, summary = F), "Table A.7.txt"); rm(tab, fit, d_fox)


# (2) MSNBC

# create survey design object with sample limited to self-reported
d_msnbc <- svydesign(ids = ~FIPS, weights = ~weight, data = data %>% filter(cablenews == 3))

# create data frame to store output
tab <- data.frame()

# regressions

# under
fit <- svyglm(under ~ week + period_1 + period_2 + period_3 + 
                log_cases*MSNBC_covid_estimate + 
                unemployment + beds_per_10k + rural_urban_continuum + bachelors + median_household_income + rep_share + black + hispanic + 
                age + female + educ3 + race + inc3 + pid7 + ideo5, 
              design = d_msnbc, family = quasibinomial(link = "logit"), rescale = T)

tab <- rbind(tab, cbind("under", "logged cases", "MSNBC", round(summary(fit)$coefficients["log_cases:MSNBC_covid_estimate", "Estimate"],3), 
                        round(summary(fit)$coefficients["log_cases:MSNBC_covid_estimate", "Std. Error"],3),
                        round(summary(fit)$coefficients["log_cases:MSNBC_covid_estimate", "Pr(>|t|)"],3)))

fit <- svyglm(under ~ week + period_1 + period_2 + period_3 + 
                log_deaths*MSNBC_covid_estimate + 
                unemployment + beds_per_10k + rural_urban_continuum + bachelors + median_household_income + rep_share + black + hispanic + 
                age + female + educ3 + race + inc3 + pid7 + ideo5, 
              design = d_msnbc, family = quasibinomial(link = "logit"), rescale = T)

tab <- rbind(tab, cbind("under", "logged deaths", "MSNBC", round(summary(fit)$coefficients["log_deaths:MSNBC_covid_estimate", "Estimate"],3), 
                        round(summary(fit)$coefficients["log_deaths:MSNBC_covid_estimate", "Std. Error"],3),
                        round(summary(fit)$coefficients["log_deaths:MSNBC_covid_estimate", "Pr(>|t|)"],3)))

fit <- svyglm(under ~ week + period_1 + period_2 + period_3 + 
                cases_per_10k*MSNBC_covid_estimate + 
                unemployment + beds_per_10k + rural_urban_continuum + bachelors + median_household_income + rep_share + black + hispanic + 
                age + female + educ3 + race + inc3 + pid7 + ideo5, 
              design = d_msnbc, family = quasibinomial(link = "logit"), rescale = T)

tab <- rbind(tab, cbind("under", "cases", "MSNBC", round(summary(fit)$coefficients["cases_per_10k:MSNBC_covid_estimate", "Estimate"],3), 
                        round(summary(fit)$coefficients["cases_per_10k:MSNBC_covid_estimate", "Std. Error"],3),
                        round(summary(fit)$coefficients["cases_per_10k:MSNBC_covid_estimate", "Pr(>|t|)"],3)))

fit <- svyglm(under ~ week + period_1 + period_2 + period_3 + 
                deaths_per_10k*MSNBC_covid_estimate + 
                unemployment + beds_per_10k + rural_urban_continuum + bachelors + median_household_income + rep_share + black + hispanic + 
                age + female + educ3 + race + inc3 + pid7 + ideo5, 
              design = d_msnbc, family = quasibinomial(link = "logit"), rescale = T)

tab <- rbind(tab, cbind("under", "deaths", "MSNBC", round(summary(fit)$coefficients["deaths_per_10k:MSNBC_covid_estimate", "Estimate"],3), 
                        round(summary(fit)$coefficients["deaths_per_10k:MSNBC_covid_estimate", "Std. Error"],3),
                        round(summary(fit)$coefficients["deaths_per_10k:MSNBC_covid_estimate", "Pr(>|t|)"],3)))

# over
fit <- svyglm(over ~ week + period_1 + period_2 + period_3 + 
                log_cases*MSNBC_covid_estimate + 
                unemployment + beds_per_10k + rural_urban_continuum + bachelors + median_household_income + rep_share + black + hispanic + 
                age + female + educ3 + race + inc3 + pid7 + ideo5, 
              design = d_msnbc, family = quasibinomial(link = "logit"), rescale = T)

tab <- rbind(tab, cbind("over", "logged cases", "MSNBC", round(summary(fit)$coefficients["log_cases:MSNBC_covid_estimate", "Estimate"],3), 
                        round(summary(fit)$coefficients["log_cases:MSNBC_covid_estimate", "Std. Error"],3),
                        round(summary(fit)$coefficients["log_cases:MSNBC_covid_estimate", "Pr(>|t|)"],3)))

fit <- svyglm(over ~ week + period_1 + period_2 + period_3 + 
                log_deaths*MSNBC_covid_estimate + 
                unemployment + beds_per_10k + rural_urban_continuum + bachelors + median_household_income + rep_share + black + hispanic + 
                age + female + educ3 + race + inc3 + pid7 + ideo5, 
              design = d_msnbc, family = quasibinomial(link = "logit"), rescale = T)

tab <- rbind(tab, cbind("over", "logged deaths", "MSNBC", round(summary(fit)$coefficients["log_deaths:MSNBC_covid_estimate", "Estimate"],3), 
                        round(summary(fit)$coefficients["log_deaths:MSNBC_covid_estimate", "Std. Error"],3),
                        round(summary(fit)$coefficients["log_deaths:MSNBC_covid_estimate", "Pr(>|t|)"],3)))

fit <- svyglm(over ~ week + period_1 + period_2 + period_3 + 
                cases_per_10k*MSNBC_covid_estimate + 
                unemployment + beds_per_10k + rural_urban_continuum + bachelors + median_household_income + rep_share + black + hispanic + 
                age + female + educ3 + race + inc3 + pid7 + ideo5, 
              design = d_msnbc, family = quasibinomial(link = "logit"), rescale = T)

tab <- rbind(tab, cbind("over", "cases", "MSNBC", round(summary(fit)$coefficients["cases_per_10k:MSNBC_covid_estimate", "Estimate"],3), 
                        round(summary(fit)$coefficients["cases_per_10k:MSNBC_covid_estimate", "Std. Error"],3),
                        round(summary(fit)$coefficients["cases_per_10k:MSNBC_covid_estimate", "Pr(>|t|)"],3)))

fit <- svyglm(over ~ week + period_1 + period_2 + period_3 + 
                deaths_per_10k*MSNBC_covid_estimate + 
                unemployment + beds_per_10k + rural_urban_continuum + bachelors + median_household_income + rep_share + black + hispanic + 
                age + female + educ3 + race + inc3 + pid7 + ideo5, 
              design = d_msnbc, family = quasibinomial(link = "logit"), rescale = T)

tab <- rbind(tab, cbind("over", "deaths", "MSNBC", round(summary(fit)$coefficients["deaths_per_10k:MSNBC_covid_estimate", "Estimate"],3), 
                        round(summary(fit)$coefficients["deaths_per_10k:MSNBC_covid_estimate", "Std. Error"],3),
                        round(summary(fit)$coefficients["deaths_per_10k:MSNBC_covid_estimate", "Pr(>|t|)"],3)))

# prep output
names(tab) <- c("DV", "covid incidence", "media", "estimate", "se", "p value")
tab$estimate <- as.numeric(tab$estimate)
tab$`p value` <- as.numeric(tab$`p value`)
tab$se <- as.numeric(tab$se)

#  generate and save Table A.7
writeLines(stargazer(tab, summary = F), "Table A.8.txt"); rm(tab, fit, d_msnbc)


# ~~~~~~~~~~


## Appendix A.16: Local vs. national news on masking

# load Neumann et al. masking data
setwd("../data/raw")
load("mask_stories_per_day.rdata")
daily_stories <- daily_stories %>% mutate(story_smoothed = story_smoothed/(758*4.524725)) # normalize count to per-program values

# Why 758*4.524725? They have 758 stations in their dataset and 2,267,059 documents (i.e. separate broadcasts) across 661 days (February 1, 2020, to November 23, 2021).
# 2,267,059/(758*661) = 4.524725, meaning they have about 4.5 broadcasts per station per day. To make this most comparable to ABC's "World News Tonight", I divide 
# by 758*4.524725


## Quantify mask coverage in ABC and USA Today following Neumann et al. method

# load preprocessed transcript text
setwd("../processed")
text <- readRDS("text_preprocessed.rds") %>% 
  mutate(date = as.Date(date, format = "%m/%d/%Y")) %>% 
  filter(source %in% c("ABC", "USA Today")) %>% 
  dplyr::select(-program)

# filter dates
daily_stories <- daily_stories %>% filter(date <= max(text$date))
text <- text %>% filter(date >= min(daily_stories$date))

# store patterns for matched words and phrases to exlcude
include_pat <- regex("^(mask.*|face cover.*|face shield.*)$", ignore_case = TRUE)
exclude_pat <- regex("^(gas mask(s)?|mask their pain|mask his pain|mask her pain)$", ignore_case = TRUE)

# create function to build merged windows, given hit indices and total length
build_windows <- function(hits, n_words, context = 50) {
  if (length(hits) == 0) return(list())
  
  # start with one window per hit
  windows <- map(hits, ~c(start = .x, end = .x))
  
  # expand each window to include any hits within ±context, iterating to closure
  windows <- map(windows, function(win) {
    repeat {
      # grab hits within current window and +/- 50 tokens
      in_range <- hits[hits >= win["start"] - context & hits <= win["end"] + context]
      new_start <- min(in_range)
      new_end   <- max(in_range)
      # if nothing new, break
      if (new_start == win["start"] && new_end == win["end"]) break
      # else update and repeat
      win["start"] <- new_start
      win["end"]   <- new_end
    }
    # finally extend by context, clipped to document bounds
    win["start"] <- max(1, win["start"] - context)
    win["end"]   <- min(n_words, win["end"] + context)
    win
  })
  
  # now merge any overlapping windows, sorting by start
  windows <- windows[order(map_dbl(windows, "start"))]
  merged <- list(windows[[1]])
  for (w in windows[-1]) {
    last <- merged[[length(merged)]]
    # if overlapping or contiguous
    if (w["start"] <= last["end"]) {
      # merge by extending end
      merged[[length(merged)]]["end"] <- max(last["end"], w["end"])
    } else {
      merged <- append(merged, list(w))
    }
  }
  merged
}

# tokenize text
results <- text %>%
  rowwise() %>%
  mutate(words = list(str_split(body, "\\s+")[[1]])) %>%
  ungroup() %>%
  mutate(
    n_words = lengths(words))

# locate matched words
results <- results %>%
  mutate(hits = map(words, ~ which(str_detect(.x, include_pat) & !str_detect(.x, exclude_pat))))

# construct windows of +/- 50 words around hits using custom function
results <- results %>%
  mutate(windows = map2(hits, n_words, ~ build_windows(.x, .y, context = 50)))

# extract matching text
results <- results %>%
  mutate(snippets = map2(words, windows, ~ map_chr(.y, function(win){
    rng <- win["start"]:win["end"]
    str_c(.x[rng], collapse = " ")
  }))) %>%
  dplyr::select(c(date, source, snippets)) %>%
  unnest_longer(snippets)

# extract number of snippets per source-day
results <- results %>%
  group_by(date, source) %>%
  summarize(n = n()) %>%
  ungroup()

# add 0s for all missing dates
days <- as.Date("2020-02-04", format = "%Y-%m-%d"):as.Date("2020-12-31", format = "%Y-%m-%d")
days <- as.Date(days)
results <- results %>%
  complete(source, date = days, fill = list(n = 0))

# create 7-day rolling averages
results <- results %>%
  arrange(source, date) %>%
  group_by(source) %>%
  mutate(n_7day = rollapply(data = n, width = 7, FUN = mean, align = "right", partial = T)) %>%
  ungroup(); rm(exclude_pat, include_pat)

# Merge data, examine correlations, and plot

# linearly interpolate values for 2 missing dates in Neumann et al. data
to_add <- as.data.frame(approx(daily_stories$date, daily_stories$story_smoothed, xout = days[which(!(days %in% daily_stories$date))]))
colnames(to_add) <- c("date", "story_smoothed")
daily_stories <- bind_rows(daily_stories, to_add) %>% arrange(date); rm(to_add, days)

# merge
results <- results %>% dplyr::select(-n)
daily_stories <- daily_stories %>% mutate(source = "Local TV") %>% rename(n_7day = story_smoothed)
results <- bind_rows(results, daily_stories); rm(daily_stories)

# calculate and save correlations for in-text number on bottom of pg. 12
setwd("../../output")
writeLines(paste(paste("Correlation ABC-local news:", round(cor(results[results$source == "ABC",]$n_7day, results[results$source == "Local TV",]$n_7day), 3)),
                 paste("Correlation USA Today-local news:", round(cor(results[results$source == "Local TV",]$n_7day, results[results$source == "USA Today",]$n_7day), 3)), sep = "\n"), "pg12_correlation_masks.txt")


# plot and save Figure A.14
p <- results %>%
  ggplot(aes(x = date, y = n_7day, group = source)) +
  geom_line(aes(color = source), linewidth = 1.4) + 
  theme_bw() + 
  labs(x = "", color = "", y = "7-day rolling average of per-program mask snippets") + 
  scale_x_continuous(breaks = as.Date(c("2020-02-04", "2020-03-01", "2020-04-01", "2020-05-01", "2020-06-01", "2020-07-01", "2020-08-01", "2020-09-01", "2020-10-01", "2020-11-01", "2020-12-01")),
                     labels = c("Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec")) + 
  theme(legend.position = "bottom",
        text = element_text(size = 18))

ggsave(filename = "Figure A.14.png", plot = p, device = "png", width = 18, height = 11, units = "in", bg = "white"); rm(p, results, build_windows, text)


# ~~~~~~~~~~


## Appendix A.17: Robustness to probabilistic assignment of ZIPs to counties

# clear workspace
rm(list = ls()); gc()

# set working directory to source file location
setwd(dirname(rstudioapi::getActiveDocumentContext()$path))


# Step (1): start recreating process from 3_replication_survey_preprocessing, avoiding 
# anything that has to be merged by county

# load survey data
setwd("../data/raw")
data <- read.csv("covid_19_economist_stacked_replication.csv")

# gender
data <- data %>%
  mutate(gender = if_else(gender == 2, 1, 0)) %>%
  rename(female = gender)

# race
data$race <- ifelse(data$race == 1, "White", 
                    ifelse(data$race == 2, "Black", 
                           ifelse(data$race == 3, "Hispanic", "Other"))) %>% as.factor() %>%
  relevel(data$race, ref = "Hispanic") %>%
  relevel(data$race, ref = "Black") %>%
  relevel(data$race, ref = "White")

# education
data$educ3 <- ifelse(data$educ == 1 | data$educ == 2, "High school or less", 
                     ifelse(data$educ == 3 | data$educ == 4, "Some or 2-year college", 
                            "Bachelor's degree or more")) %>% as.factor() %>%
  relevel(data$educ3, ref = "Some or 2-year college") %>%
  relevel(data$educ3, ref = "High school or less"); data <- data %>% dplyr::select(-educ)

# income
data$inc3 <- ifelse(data$faminc <= 4, "Less than $40,000", 
                    ifelse(data$faminc >= 5 & data$faminc <= 8, "$40,000 - $79,999", 
                           ifelse(data$faminc >= 9 & data$faminc <= 16, "More than $80,000", NA))) %>% as.factor() %>%
  relevel(data$inc3, ref = "$40,000 - $79,999") %>%
  relevel(data$inc3, ref = "Less than $40,000"); data <- data %>% dplyr::select(-faminc)

# ideology
data$ideo5 <- ifelse(data$ideo5 == 1, "Very liberal", 
                     ifelse(data$ideo5 == 2, "Liberal", 
                            ifelse(data$ideo5 == 3, "Moderate", 
                                   ifelse(data$ideo5 == 4, "Conservative", 
                                          ifelse(data$ideo5 == 5, "Very conservative", "Not sure")))))

# load and merge total COVID deaths by day from CDC
usdeaths <- read.csv("data_table_for_total_deaths__united_states.csv") %>%
  rename(surveydate = Date, total_deaths = Total.Deaths)
data <- left_join(data, usdeaths); rm(usdeaths)

# partisan ID
data$pid7 <- ifelse(data$pid7 == 1, "Strong Democrat", 
                    ifelse(data$pid7 == 2, "Not very strong Democrat", 
                           ifelse(data$pid7 == 3, "Lean Democrat", 
                                  ifelse(data$pid7 == 4 | data$pid7 == 8, "Independent", 
                                         ifelse(data$pid7 == 5, "Lean Republican", 
                                                ifelse(data$pid7 == 6, "Not very strong Republican", 
                                                       ifelse(data$pid7 == 7, "Strong Republican", NA)))))))

# create trichotomous party variable
data$party <- ifelse(data$pid7 %in% c("Strong Republican", "Not very strong Republican", "Lean Republican"), "Republican", 
                     ifelse(data$pid7 == "Independent", "Independent", "Democrat"))

# convert date variable to Date
data$surveydate <- as.Date(data$surveydate, tryFormats = "%m/%d/%Y")

# construct misperception variables

# extract unique survey dates
weeks <- unique(data$surveydate)

# regardless of survey date, anyone who responded (6) to covid_deaths is marked as overestimating
data$over <- ifelse(data$covid_deaths == 6, 1, 0)

# initialize underestimation variables 
data$under <- NA

# loop over weeks
for(w in weeks){
  
  # extract number of deaths by that survey date
  deaths <- unique(data[data$surveydate == w,]$total_deaths)
  
  # if deaths <= 100k, no one under
  if(deaths <= 100000){
    
    data[data$surveydate == w,]$under <- 0
    
    # if deaths > 100k & <= 150k, under = anyone who says (1)  
  }else if(deaths > 100000 & deaths <= 150000){
    
    data[data$surveydate == w,]$under <- ifelse(data[data$surveydate == w,]$covid_deaths == 1, 1, 0)
    
    # if deaths > 150k & <= 200k, under = anyone who says (1) or (2)  
  }else if(deaths > 150000 & deaths <= 200000){
    
    data[data$surveydate == w,]$under <- ifelse(data[data$surveydate == w,]$covid_deaths %in% c(1, 2), 1, 0)
    
    # if deaths > 200k & <= 250k, under = anyone who says (1), (2), or (3)  
  }else if(deaths > 200000 & deaths <= 250000){
    
    data[data$surveydate == w,]$under <- ifelse(data[data$surveydate == w,]$covid_deaths %in% c(1, 2, 3), 1, 0)
    
    # if deaths > 250k, under = anyone who says (1), (2), (3), or (4)  
  }else{
    
    data[data$surveydate == w,]$under <- ifelse(data[data$surveydate == w,]$covid_deaths %in% c(1, 2, 3, 4), 1, 0)
    
  }
}; rm(w, deaths, weeks)

# fix NAs
data$under <- ifelse(is.na(data$covid_deaths), NA, data$under)

# merge in topic model

# load topic model
setwd("../processed")
topics <- readRDS("topic_model.rds")

# add week variable to main data for merging
data$week <- as.numeric(strftime(data$surveydate, format = "%V"))

# pivot topic model wider for merging, create one measure for each source, and average USA Today and ABC
topics <- topics %>% pivot_wider(names_from = source, values_from = estimate) %>%
  rename(ABC_covid_estimate = ABC, FOX_covid_estimate = Fox, MSNBC_covid_estimate = MSNBC, USA_covid_estimate = `USA Today`) %>%
  mutate(covid_nonpartisan_estimate = (ABC_covid_estimate+USA_covid_estimate)/2)

# lag topics by 1 week to merge based on coverage in the week before respondents' survey administration
topics$week <- topics$week-1

# make sure topics is arranged by week
topics <- topics %>% arrange(week)

# create ~two-week~ and ~month~ versions of nonpartisan press estimate
topics <- topics %>% 
  mutate(covid_2weeks_estimate = rollapply((ABC_covid_estimate + USA_covid_estimate)/2,
                                           width = 2,
                                           FUN = sum,
                                           align = "right", 
                                           fill = (ABC_covid_estimate + USA_covid_estimate)/2))
topics <- topics %>% 
  mutate(covid_1month_estimate = rollapply((ABC_covid_estimate + USA_covid_estimate)/2,
                                           width = 4,
                                           FUN = sum,
                                           align = "right", 
                                           fill = (ABC_covid_estimate + USA_covid_estimate)/2))

# merge into main data
data <- left_join(data, topics); rm(topics)

# load ZIP code to county crosswalk file
setwd("../raw")
crosswalk <- read.csv("ZIP_COUNTY_122020.csv")

# add 0s to ZIP codes that are fewer than 5 characters long | this is necessary for merging
data$zip_code <- ifelse(nchar(data$zip_code) == 5, data$zip_code, 
                        ifelse(nchar(data$zip_code) == 4, paste("0", data$zip_code, sep = ""),
                               ifelse(nchar(data$zip_code) == 3, paste("00", data$zip_code, sep = ""),
                                      ifelse(nchar(data$zip_code) == 2, paste("000", data$zip_code, sep = ""),
                                             ifelse(nchar(data$zip_code) == 1, paste("0000", data$zip_code, sep = ""), NA)))))

crosswalk$ZIP <- ifelse(nchar(crosswalk$ZIP) == 5, crosswalk$ZIP, 
                        ifelse(nchar(crosswalk$ZIP) == 4, paste("0", crosswalk$ZIP, sep = ""), paste("00", crosswalk$ZIP, sep = "")))

# add 0s to county FIPS codes that are fewer than 5 characters long | this is necessary for merging
crosswalk$COUNTY <- ifelse(nchar(crosswalk$COUNTY) == 4, paste("0", crosswalk$COUNTY, sep = ""), crosswalk$COUNTY)

# initialize county FIPS variable
data$FIPS <- NA


# Step (2): Repeating 250 times, probabilistically assign counties based on ZIP codes,
# merge in county-level data, and run regressions

# initialize output
output <- data.frame()

# pick 250 seeds at random
set.seed(12345)
seeds <- sample(1:100000, 250, replace = F)

# loop | takes 10 hours to run
for(i in 1:250){
  
  # save temporary data set
  df <- data
  
  
  ## Map ZIP codes to counties
  
  # assign ZIP codes probabilistically based on proportion of ZIP code addresses in each county
  
  # loop over respondents
  for(j in 1:nrow(df)){
    
    # if zip_code corresponds to only one county
    if(sum(crosswalk$ZIP == df[j,]$zip_code, na.rm = T) == 1){
      
      # set data$FIPS to its value from crosswalk
      df[j,]$FIPS <- crosswalk[crosswalk$ZIP == df[j,]$zip_code,]$COUNTY
      
      # if zip_code corresponds to multiple counties
    }else if(sum(crosswalk$ZIP == df[j,]$zip_code, na.rm = T) > 1){
      
      # assign data$FIPS randomly based on proportion of RES_RATIO
      set.seed(seeds[i])
      df[j,]$FIPS <- sample(x = crosswalk[crosswalk$ZIP == df[j,]$zip_code,]$COUNTY, 
                            size = 1, 
                            prob = crosswalk[crosswalk$ZIP == df[j,]$zip_code,]$RES_RATIO)
      
    }
  }; rm(j)
  
  df$FIPS <- as.numeric(df$FIPS)
  
  # drop participants with no ZIP
  df <- df %>% filter(!is.na(FIPS))
  
  
  ## Merge in COVID-19 cases and deaths
  
  # load Johns Hopkins COVID data
  deaths <- read.csv("time_series_covid19_deaths_US.csv") %>% 
    dplyr::select(-c(UID, iso2, iso3, code3, Country_Region, Lat, Long_, Combined_Key)) %>%
    rename(population = Population, county = Admin2, state = Province_State)
  
  # pivot datasets to long so date is in one column variable
  deaths <- pivot_longer(deaths, cols = starts_with("X"), names_to = "surveydate", values_to = "deaths")
  
  # remove observations with missing FIPS
  deaths <- deaths %>% filter(!is.na(FIPS))
  
  # convert date variable to Date to merge
  deaths$surveydate <- gsub("X", "", deaths$surveydate)
  deaths$surveydate <- as.Date(deaths$surveydate, tryFormats = "%m.%d.%Y")
  
  # create measure of cumulative deaths in the 30 days leading up to survey administration
  deaths <- deaths %>% group_by(FIPS) %>% mutate(deaths_30 = deaths - lag(deaths, 30, default = 0))
  
  # merge into survey data
  df <- left_join(df, deaths %>% dplyr::select(-c(county, state)), relationship = "many-to-one"); rm(deaths)
  
  
  ## Merge in county-level covariates
  
  # (1) Hospital beds
  
  # load and prep data
  beds <- read.csv("American_Hospital_Association_2019.csv") %>% 
    dplyr::select(-c(ID, MLOCSTCD, FCNTYCD, CNTYNAME, YEAR)) %>%
    rename(beds = HOSPBD, FIPS = FCOUNTY) %>%
    group_by(FIPS) %>%
    summarize(beds = sum(beds))
  
  # merge into survey data
  df <- left_join(df, beds); rm(beds)
  
  # replace any NAs with 0s
  df$beds <- ifelse(is.na(df$beds), 0, df$beds)
  
  
  # (2) Educational attainment
  
  # load and prep data
  education <- read.csv("Education.csv") %>%
    dplyr::select(c(FIPS.Code, Percent.of.adults.with.a.bachelor.s.degree.or.higher..2015.19)) %>%
    rename(FIPS = FIPS.Code, bachelors = Percent.of.adults.with.a.bachelor.s.degree.or.higher..2015.19)
  
  # merge into survey data
  df <- left_join(df, education); rm(education)
  
  
  # (3) Black and Hispanic population
  
  # load and prep data
  race <- read.csv("county_race.csv") %>%
    dplyr::select(-NAME) %>%
    mutate(GEO_ID = substr(GEO_ID, nchar(GEO_ID)-4, nchar(GEO_ID)), # fix FIPS code
           GEO_ID = as.numeric(GEO_ID)) %>%
    rename(FIPS = GEO_ID)
  
  # merge into survey data
  df <- left_join(df, race); rm(race)
  
  # convert population totals to proportions
  df <- df %>% 
    mutate(BLACK = BLACK/population,
           HISPANIC = HISPANIC/population) %>% 
    rename(black = BLACK, hispanic = HISPANIC)
  
  
  # (4) Median household income
  
  # load and prep data
  income <- read.csv("est19all.csv") %>%
    dplyr::select(c(FIPS, Median.Household.Income)) %>%
    rename(median_household_income = Median.Household.Income) %>%
    mutate(median_household_income = gsub("[[:punct:] ]+", "", median_household_income), # remove commas
           median_household_income = as.numeric(median_household_income)) # one tiny HI county has value '.' for privacy
  
  # merge into survey data
  df <- left_join(df, income); rm(income)
  
  
  # (5) Republican 2016 presidential vote share
  
  # load and prep data
  votes <- read.csv("countypres_2016.csv") %>% 
    filter(year == 2016 & candidate != "Other" & !is.na(FIPS)) %>% # subset to only include 2016 and to include only two-party vote share and to remove missing FIPS (e.g. for statewide write-ins)
    dplyr::select(-c(version, year, office, candidate))
  
  votes <- pivot_wider(votes, id_cols = FIPS, names_from = party, values_from = candidatevotes)
  votes$rep_share <- votes$republican/(votes$democrat + votes$republican)
  
  # merge into survey data
  df <- left_join(df, votes %>% dplyr::select(c(FIPS, rep_share))); rm(votes)
  
  
  # (6) Unemployment and (7) Rural-urban continuum
  
  # load and prep data
  econ <- read_excel("Unemployment.xlsx", sheet = "UnemploymentMedianIncome") %>%
    dplyr::select(c(FIPS_Code, Rural_Urban_Continuum_Code_2013, Unemployment_rate_2019)) %>%
    rename(FIPS = FIPS_Code, rural_urban_continuum = Rural_Urban_Continuum_Code_2013, unemployment = Unemployment_rate_2019) %>%
    mutate(FIPS = as.numeric(FIPS))
  
  # merge into survey data
  df <- left_join(df, econ); rm(econ)
  
  
  ## Prep data for regressions
  
  # drop observation with missing weight
  df <- df %>% filter(!is.na(weight))
  
  # create per-capita hospital beds variable
  df$beds_per_10k <- df$beds/(df$population/10000)
  
  # create per-capita COVID fatality variables
  df$deaths_per_10k <- df$deaths/(df$population/10000)
  df$deaths_30_per_10k <- df$deaths_30/(df$population/10000)
  
  # create natural logged COVID fatality variables | make any negatives equal to 0
  # these occur when locals correct previously fatality counts and thus include daily negative values
  df$log_deaths <- log(df$deaths_per_10k + 1)
  df$log_deaths_30 <- ifelse(df$deaths_30_per_10k >= 0, log(df$deaths_30_per_10k + 1), 0)
  
  # create period indicator variables to indicate four periods in which the response options labeled as under-predictions remain the same
  df$period_1 <- ifelse(df$surveydate %in% c("2020-05-23", "2020-05-30", "2020-06-06", "2020-06-13"), 1, 0)
  df$period_2 <- ifelse(df$surveydate %in% c("2020-08-15", "2020-08-22", "2020-08-29", "2020-09-05", "2020-09-12"), 1, 0)
  df$period_3 <- ifelse(df$surveydate %in% c("2020-09-19", "2020-09-26", "2020-10-03", "2020-10-10", "2020-10-17", "2020-10-24", "2020-10-31", "2020-11-07"), 1, 0)
  
  # create survey design object
  d <- svydesign(ids = ~FIPS, weights = ~weight, data = df)
  
  
  ## Run regressions
  
  # under ~ logged cumulative deaths
  fit1 <- svyglm(under ~ week + period_1 + period_2 + period_3 + 
                   log_deaths*covid_nonpartisan_estimate + 
                   unemployment + beds_per_10k + rural_urban_continuum + bachelors + median_household_income + rep_share + black + hispanic + 
                   age + female + educ3 + race + inc3 + pid7 + ideo5, 
                 design = d, family = quasibinomial(link = "logit"), rescale = T)
  
  # under ~ logged deaths in last 30 days
  fit2 <- svyglm(under ~ week + period_1 + period_2 + period_3 + 
                   log_deaths_30*covid_nonpartisan_estimate + 
                   unemployment + beds_per_10k + rural_urban_continuum + bachelors + median_household_income + rep_share + black + hispanic + 
                   age + female + educ3 + race + inc3 + pid7 + ideo5, 
                 design = d, family = quasibinomial(link = "logit"), rescale = T)
  
  # over ~ logged cumulative deaths
  fit3 <- svyglm(over ~ week + period_1 + period_2 + period_3 + 
                   log_deaths*covid_nonpartisan_estimate + 
                   unemployment + beds_per_10k + rural_urban_continuum + bachelors + median_household_income + rep_share + black + hispanic + 
                   age + female + educ3 + race + inc3 + pid7 + ideo5, 
                 design = d, family = quasibinomial(link = "logit"), rescale = T)
  
  # over ~ logged deaths in last 30 days
  fit4 <- svyglm(over ~ week + period_1 + period_2 + period_3 + 
                   log_deaths_30*covid_nonpartisan_estimate + 
                   unemployment + beds_per_10k + rural_urban_continuum + bachelors + median_household_income + rep_share + black + hispanic + 
                   age + female + educ3 + race + inc3 + pid7 + ideo5, 
                 design = d, family = quasibinomial(link = "logit"), rescale = T)
  
  # extract values
  sum_fit1 <- as.data.frame(summary(fit1)$coefficient) %>% mutate(IV = row.names(.)) %>% filter(IV == "log_deaths:covid_nonpartisan_estimate") %>% dplyr::select(-`t value`)
  sum_fit2 <- as.data.frame(summary(fit2)$coefficient) %>% mutate(IV = row.names(.)) %>% filter(IV == "log_deaths_30:covid_nonpartisan_estimate") %>% dplyr::select(-`t value`)
  sum_fit3 <- as.data.frame(summary(fit3)$coefficient) %>% mutate(IV = row.names(.)) %>% filter(IV == "log_deaths:covid_nonpartisan_estimate") %>% dplyr::select(-`t value`)
  sum_fit4 <- as.data.frame(summary(fit4)$coefficient) %>% mutate(IV = row.names(.)) %>% filter(IV == "log_deaths_30:covid_nonpartisan_estimate") %>% dplyr::select(-`t value`)
  rownames(sum_fit1) <- rownames(sum_fit2) <- rownames(sum_fit3) <- rownames(sum_fit4) <- NULL
  
  # make table
  tab <- bind_rows(sum_fit1, sum_fit2, sum_fit3, sum_fit4) %>%
    rename(est = Estimate, se = `Std. Error`, pval = `Pr(>|t|)`) %>%
    mutate(est = round(est, 3),
           se = round(se, 3),
           pval = round(pval, 3),
           IV = if_else(IV == "log_deaths:covid_nonpartisan_estimate", "cumulative deaths", "deaths in last 30 days"),
           DV = c(rep("under", 2), rep("over", 2)),
           obs = c(nobs(fit1), nobs(fit2), nobs(fit3), nobs(fit4)),
           iter = rep(i, 4))
  
  # append to output
  output <- bind_rows(output, tab)
  
  # print i to keep track
  print(i)
  
}; rm(i, tab, sum_fit1, sum_fit2, sum_fit3, sum_fit4, fit1, fit2, fit3, fit4, df, crosswalk, seeds)

# plot distribution of estimates
p <- output %>%
  mutate(sig = if_else(pval < 0.05, "S", "NS"),
         DV = if_else(DV == "under", "underestimation", "overestimation")) %>%
  ggplot(aes(x = iter, y = est, color = sig)) + 
  geom_point() + 
  geom_errorbar(aes(ymin = est - 2*se, ymax = est + 2*se)) + 
  scale_color_manual(values = c("gray", "black")) + 
  facet_wrap(~ IV + DV) + 
  theme_bw() + 
  labs(x = "", y = "Coefficient estimate +/- 2 standard errors") + 
  theme(legend.position = "none",
        text = element_text(size = 18))

# save
ggsave(filename = "Figure A.15.png", plot = p, device = "png", path = "../../output", width = 18, height = 15, units = "in", bg = "white")

# save numbers for Appendix A.17
setwd("../../output")
writeLines(paste(paste("Min estimate value for under/cumulative deaths:", round(min(output[output$DV == "under" & output$IV == "cumulative deaths",]$est),3)),
                 paste("Max estimate value for under/cumulative deaths:", round(max(output[output$DV == "under" & output$IV == "cumulative deaths",]$est),3)),
                 paste("Mean estimate value for under/cumulative deaths:", round(mean(output[output$DV == "under" & output$IV == "cumulative deaths",]$est),3)),
                 paste("Percent of under/cumulative deaths that are negative and statistically significant:", sum(output[output$DV == "under" & output$IV == "cumulative deaths",]$est < 0 & output[output$DV == "under" & output$IV == "cumulative deaths",]$pval < 0.05)/250),
                 paste("Min estimate value for over/cumulative deaths:", round(min(output[output$DV == "over" & output$IV == "cumulative deaths",]$est),3)),
                 paste("Max estimate value for over/cumulative deaths:", round(max(output[output$DV == "over" & output$IV == "cumulative deaths",]$est),3)),
                 paste("Mean estimate value for over/cumulative deaths:", round(mean(output[output$DV == "over" & output$IV == "cumulative deaths",]$est),3)),
                 paste("Percent of over/cumulative deaths that are negative and statistically significant:", sum(output[output$DV == "over" & output$IV == "cumulative deaths",]$est < 0 & output[output$DV == "over" & output$IV == "cumulative deaths",]$pval < 0.05)/250),
                 paste("Min estimate value for under/deaths in last 30:", round(min(output[output$DV == "under" & output$IV == "deaths in last 30 days",]$est),3)),
                 paste("Max estimate value for under/deaths in last 30:", round(max(output[output$DV == "under" & output$IV == "deaths in last 30 days",]$est),3)),
                 paste("Mean estimate value for under/deaths in last 30:", round(mean(output[output$DV == "under" & output$IV == "deaths in last 30 days",]$est),3)),
                 paste("Percent of under/deaths in last 30 that are negative and statistically significant:", sum(output[output$DV == "under" & output$IV == "deaths in last 30 days",]$est < 0 & output[output$DV == "under" & output$IV == "deaths in last 30 days",]$pval < 0.05)/250),
                 paste("Min estimate value for over/deaths in last 30:", round(min(output[output$DV == "over" & output$IV == "deaths in last 30 days",]$est),3)),
                 paste("Max estimate value for over/deaths in last 30:", round(max(output[output$DV == "over" & output$IV == "deaths in last 30 days",]$est),3)),
                 paste("Mean estimate value for over/deaths in last 30:", round(mean(output[output$DV == "over" & output$IV == "deaths in last 30 days",]$est),3)),
                 paste("Percent of over/deaths in last 30 that are negative and statistically significant:", sum(output[output$DV == "over" & output$IV == "deaths in last 30 days",]$est < 0 & output[output$DV == "over" & output$IV == "deaths in last 30 days",]$pval < 0.05)/250),
                 sep = "\n"), "Appendix A.17.txt")

